fixing bug for ner (#463)

* fixing bug for ner
* removing global var
* adding class for trial counter
* adding notebook
* adding use_ray dict
* updating documentation for nlp
2022-03-20 22:03:02 -04:00 · 2022-03-20 22:03:02 -04:00 · af423463c3
parent 5f688c1662
commit af423463c3
18 changed files with 1999 additions and 324 deletions
--- a/flaml/automl.py
+++ b/flaml/automl.py
@ -246,6 +246,7 @@ class AutoMLState:
            * sample_size
            / state.data_size[0]
        )
+        # raise Exception("bbbbb", state.time_budget, budget)

        if _is_nlp_task(state.task):
            state.fit_kwargs["X_val"] = state.X_val
@ -326,80 +327,29 @@ class AutoMLState:
            weight = None
        if groups is not None:
            self.fit_kwargs["groups"] = groups
+
        budget = (
            None
            if self.time_budget is None
            else self.time_budget - self.time_from_start
        )
-        if (
-            hasattr(self, "resources_per_trial")
-            and self.resources_per_trial.get("gpu", 0) > 0
-        ):

-            if _is_nlp_task(self.task):
-                use_ray = self.fit_kwargs.get("use_ray")
-                self.fit_kwargs["use_ray"] = True
+        estimator, train_time = train_estimator(
+            X_train=sampled_X_train,
+            y_train=sampled_y_train,
+            config_dic=config,
+            task=self.task,
+            estimator_name=estimator,
+            n_jobs=self.n_jobs,
+            estimator_class=self.learner_classes.get(estimator),
+            budget=budget,
+            fit_kwargs=self.fit_kwargs,
+            eval_metric="train_time",
+        )

-            def _trainable_function_wrapper(config: dict):
-
-                return_estimator, train_time = train_estimator(
-                    X_train=sampled_X_train,
-                    y_train=sampled_y_train,
-                    config_dic=config,
-                    task=self.task,
-                    estimator_name=estimator,
-                    n_jobs=self.n_jobs,
-                    estimator_class=self.learner_classes.get(estimator),
-                    budget=budget,
-                    fit_kwargs=self.fit_kwargs,
-                )
-                return {"estimator": return_estimator, "train_time": train_time}
-
-            if estimator not in self.learner_classes:
-                self.learner_classes[estimator] = get_estimator_class(
-                    self.task, estimator
-                )
-
-            analysis = tune.run(
-                _trainable_function_wrapper,
-                config=config_w_resource,
-                metric="train_time",
-                mode="min",
-                resources_per_trial=self.resources_per_trial,
-                num_samples=1,
-                use_ray=True,
-            )
-            result = list(analysis.results.values())[0]
-            estimator, train_time = result["estimator"], result["train_time"]
-
-            if _is_nlp_task(self.task):
-                if use_ray is None:
-                    del self.fit_kwargs["use_ray"]
-                else:
-                    self.fit_kwargs["use_ray"] = use_ray
-                estimator.use_ray = False
-        else:
-            if _is_nlp_task(self.task):
-                use_ray = self.fit_kwargs.get("use_ray")
-                self.fit_kwargs["use_ray"] = False
-            estimator, train_time = train_estimator(
-                X_train=sampled_X_train,
-                y_train=sampled_y_train,
-                config_dic=config,
-                task=self.task,
-                estimator_name=estimator,
-                n_jobs=self.n_jobs,
-                estimator_class=self.learner_classes.get(estimator),
-                budget=budget,
-                fit_kwargs=self.fit_kwargs,
-            )
-            if _is_nlp_task(self.task):
-                if use_ray is None:
-                    del self.fit_kwargs["use_ray"]
-                else:
-                    self.fit_kwargs["use_ray"] = use_ray
        if sampled_weight is not None:
            self.fit_kwargs["sample_weight"] = weight
+
        return estimator, train_time


@ -749,7 +699,11 @@ class AutoML(BaseEstimator):
        """Time taken to find best model in seconds."""
        return self.__dict__.get("_time_taken_best_iter")

-    def predict(self, X: Union[np.array, pd.DataFrame, List[str], List[List[str]]]):
+    def predict(
+        self,
+        X: Union[np.array, pd.DataFrame, List[str], List[List[str]]],
+        **pred_kwargs,
+    ):
        """Predict label from features.

        Args:
@ -761,6 +715,8 @@ class AutoML(BaseEstimator):
                    arima or sarimax). Other columns in the dataframe
                    are assumed to be exogenous variables (categorical
                    or numeric).
+            **pred_kwargs: Other keyword arguments to pass to predict() function of
+                the searched learners, such as per_device_eval_batch_size.

        ```python
        multivariate_X_test = pd.DataFrame({
@ -782,7 +738,7 @@ class AutoML(BaseEstimator):
            )
            return None
        X = self._preprocess(X)
-        y_pred = estimator.predict(X)
+        y_pred = estimator.predict(X, **pred_kwargs)
        if (
            isinstance(y_pred, np.ndarray)
            and y_pred.ndim > 1
@ -796,12 +752,14 @@ class AutoML(BaseEstimator):
        else:
            return y_pred

-    def predict_proba(self, X):
+    def predict_proba(self, X, **pred_kwargs):
        """Predict the probability of each class from features, only works for
        classification problems.

        Args:
            X: A numpy array of featurized instances, shape n * m.
+            **pred_kwargs: Other keyword arguments to pass to predict_proba() function of
+                the searched learners, such as per_device_eval_batch_size.

        Returns:
            A numpy array of shape n * c. c is the  # classes. Each element at
@ -814,7 +772,7 @@ class AutoML(BaseEstimator):
            )
            return None
        X = self._preprocess(X)
-        proba = self._trained_estimator.predict_proba(X)
+        proba = self._trained_estimator.predict_proba(X, **pred_kwargs)
        return proba

    def _preprocess(self, X):
@ -1319,6 +1277,7 @@ class AutoML(BaseEstimator):
            task=task,
            estimator_name=estimator,
            estimator_class=self._state.learner_classes.get(estimator),
+            eval_metric="train_time",
        )
        return estimator

@ -1680,6 +1639,17 @@ class AutoML(BaseEstimator):
        """
        return self._state.data_size[0] if self._sample else None

+    def pickle(self, output_file_name):
+        import pickle
+
+        estimator_to_training_function = {}
+        for estimator in self.estimator_list:
+            search_state = self._search_states[estimator]
+            estimator_to_training_function[estimator] = search_state.training_function
+            del search_state.training_function
+        with open(output_file_name, "wb") as f:
+            pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
+
    @property
    def trainable(self) -> Callable[[dict], Optional[float]]:
        """Training function.
@ -1960,10 +1930,10 @@ class AutoML(BaseEstimator):
                augment rare classes.
            min_sample_size: int, default=MIN_SAMPLE_TRAIN | the minimal sample
                size when sample=True.
-            use_ray: boolean, default=False | Whether to use ray to run the training
+            use_ray: boolean or dict
+                If boolean: default=False | Whether to use ray to run the training
                in separate processes. This can be used to prevent OOM for large
-                datasets, but will incur more overhead in time. Only use it if
-                you run into OOM failures.
+                datasets, but will incur more overhead in time.
            metric_constraints: list, default=[] | The list of metric constraints.
                Each element in this list is a 3-tuple, which shall be expressed
                in the following format: the first element of the 3-tuple is the name of the
@ -2064,14 +2034,21 @@ class AutoML(BaseEstimator):
            import ray

            n_cpus = use_ray and ray.available_resources()["CPU"] or os.cpu_count()
+
            self._state.resources_per_trial = (
                # when using gpu, default cpu is 1 per job; otherwise, default cpu is n_cpus / n_concurrent_trials
-                {"cpu": max(int(n_cpus / n_concurrent_trials), 1), "gpu": gpu_per_trial}
-                if gpu_per_trial == 0
-                else {"cpu": 1, "gpu": gpu_per_trial}
+                (
+                    {
+                        "cpu": max(int((n_cpus - 2) / 2 / n_concurrent_trials), 1),
+                        "gpu": gpu_per_trial,
+                    }
+                    if gpu_per_trial == 0
+                    else {"cpu": 1, "gpu": gpu_per_trial}
+                )
                if n_jobs < 0
                else {"cpu": n_jobs, "gpu": gpu_per_trial}
            )
+
            if isinstance(X_train, ray.ObjectRef):
                X_train = ray.get(X_train)
            elif isinstance(dataframe, ray.ObjectRef):
@ -2131,7 +2108,11 @@ class AutoML(BaseEstimator):
            )
        )
        if "auto" == metric:
-            if "binary" in self._state.task:
+            if _is_nlp_task(self._state.task):
+                from .nlp.utils import load_default_huggingface_metric_for_task
+
+                metric = load_default_huggingface_metric_for_task(self._state.task)
+            elif "binary" in self._state.task:
                metric = "roc_auc"
            elif "multi" in self._state.task:
                metric = "log_loss"
@ -2139,17 +2120,9 @@ class AutoML(BaseEstimator):
                metric = "mape"
            elif self._state.task == "rank":
                metric = "ndcg"
-            elif _is_nlp_task(self._state.task):
-                from .nlp.utils import load_default_huggingface_metric_for_task
-
-                metric = load_default_huggingface_metric_for_task(self._state.task)
            else:
                metric = "r2"

-        if _is_nlp_task(self._state.task):
-            self._state.fit_kwargs["metric"] = metric
-            self._state.fit_kwargs["use_ray"] = self._use_ray
-
        self._state.metric = metric

        def is_to_reverse_metric(metric, task):
@ -2355,6 +2328,14 @@ class AutoML(BaseEstimator):
        elif "random" == self._hpo_method:
            from ray.tune.suggest import BasicVariantGenerator as SearchAlgo
            from ray.tune.sample import Domain
+        elif "optuna" == self._hpo_method:
+            try:
+                from ray import __version__ as ray_version
+
+                assert ray_version >= "1.0.0"
+                from ray.tune.suggest.optuna import OptunaSearch as SearchAlgo
+            except (ImportError, AssertionError):
+                from .searcher.suggestion import OptunaSearch as SearchAlgo
        else:
            raise NotImplementedError(
                f"hpo_method={self._hpo_method} is not recognized. "
@ -2382,24 +2363,48 @@ class AutoML(BaseEstimator):
        else:
            self._state.time_from_start = time.time() - self._start_time_flag
            time_left = self._state.time_budget - self._state.time_from_start
-            search_alg = SearchAlgo(
-                metric="val_loss",
-                space=space,
-                low_cost_partial_config=self.low_cost_partial_config,
-                points_to_evaluate=self.points_to_evaluate,
-                cat_hp_cost=self.cat_hp_cost,
-                resource_attr=self.resource_attr,
-                min_resource=self.min_resource,
-                max_resource=self.max_resource,
-                config_constraints=[
-                    (partial(size, self._state), "<=", self._mem_thres)
-                ],
-                metric_constraints=self.metric_constraints,
-                seed=self._seed,
-                time_budget_s=time_left,
-            )
+            if self._hpo_method != "optuna":
+                search_alg = SearchAlgo(
+                    metric="val_loss",
+                    space=space,
+                    low_cost_partial_config=self.low_cost_partial_config,
+                    points_to_evaluate=self.points_to_evaluate,
+                    cat_hp_cost=self.cat_hp_cost,
+                    resource_attr=self.resource_attr,
+                    min_resource=self.min_resource,
+                    max_resource=self.max_resource,
+                    config_constraints=[
+                        (partial(size, self._state), "<=", self._mem_thres)
+                    ],
+                    metric_constraints=self.metric_constraints,
+                    seed=self._seed,
+                    time_budget_s=time_left,
+                )
+            else:
+                # if self._hpo_method is bo, sometimes the search space and the initial config dimension do not match
+                # need to remove the extra keys from the search space to be consistent with the initial config
+                converted_space = SearchAlgo.convert_search_space(space)
+
+                removed_keys = set(space.keys()).difference(converted_space.keys())
+                new_points_to_evaluate = []
+                for idx in range(len(self.points_to_evaluate)):
+                    r = self.points_to_evaluate[idx].copy()
+                    for each_key in removed_keys:
+                        r.pop(each_key)
+                    new_points_to_evaluate.append(r)
+
+                search_alg = SearchAlgo(
+                    metric="val_loss",
+                    mode="min",
+                    points_to_evaluate=[
+                        p
+                        for p in new_points_to_evaluate
+                        if len(p) == len(converted_space)
+                    ],
+                )
            search_alg = ConcurrencyLimiter(search_alg, self._n_concurrent_trials)
        resources_per_trial = self._state.resources_per_trial
+
        analysis = ray.tune.run(
            self.trainable,
            search_alg=search_alg,
@ -2413,6 +2418,7 @@ class AutoML(BaseEstimator):
            raise_on_failed_trial=False,
            keep_checkpoints_num=1,
            checkpoint_score_attr="min-val_loss",
+            **self._use_ray if isinstance(self._use_ray, dict) else {},
        )
        # logger.info([trial.last_result for trial in analysis.trials])
        trials = sorted(
@ -2579,6 +2585,7 @@ class AutoML(BaseEstimator):
                        if isinstance(search_state.init_config, list)
                        else [search_state.init_config]
                    )
+
                    low_cost_partial_config = search_state.low_cost_partial_config
                if self._hpo_method in ("bs", "cfo", "grid", "cfocat", "random"):
                    algo = SearchAlgo(
@ -2598,6 +2605,20 @@ class AutoML(BaseEstimator):
                        seed=self._seed,
                    )
                else:
+                    # if self._hpo_method is bo, sometimes the search space and the initial config dimension do not match
+                    # need to remove the extra keys from the search space to be consistent with the initial config
+                    converted_space = SearchAlgo.convert_search_space(search_space)
+                    removed_keys = set(search_space.keys()).difference(
+                        converted_space.keys()
+                    )
+                    new_points_to_evaluate = []
+                    for idx in range(len(points_to_evaluate)):
+                        r = points_to_evaluate[idx].copy()
+                        for each_key in removed_keys:
+                            r.pop(each_key)
+                        new_points_to_evaluate.append(r)
+                    points_to_evaluate = new_points_to_evaluate
+
                    algo = SearchAlgo(
                        metric="val_loss",
                        mode="min",
--- a/flaml/ml.py
+++ b/flaml/ml.py
@ -397,6 +397,7 @@ def get_val_loss(
    #     fit_kwargs['groups_val'] = groups_val
    #     fit_kwargs['X_val'] = X_val
    #     fit_kwargs['y_val'] = y_val
+
    estimator.fit(X_train, y_train, budget, **fit_kwargs)
    val_loss, metric_for_logging, pred_time, _ = _eval_estimator(
        config,
@ -561,6 +562,10 @@ def compute_estimator(
        task=task,
        n_jobs=n_jobs,
    )
+
+    if isinstance(estimator, TransformersEstimator):
+        fit_kwargs["metric"] = eval_metric
+
    if "holdout" == eval_method:
        val_loss, metric_for_logging, train_time, pred_time = get_val_loss(
            config_dic,
@ -604,6 +609,7 @@ def train_estimator(
    estimator_class=None,
    budget=None,
    fit_kwargs={},
+    eval_metric=None,
 ):
    start_time = time.time()
    estimator_class = estimator_class or get_estimator_class(task, estimator_name)
@ -612,6 +618,9 @@ def train_estimator(
        task=task,
        n_jobs=n_jobs,
    )
+    if isinstance(estimator, TransformersEstimator):
+        fit_kwargs["metric"] = eval_metric
+
    if X_train is not None:
        train_time = estimator.fit(X_train, y_train, budget, **fit_kwargs)
    else:
--- a/flaml/model.py
+++ b/flaml/model.py
@ -197,7 +197,7 @@ class BaseEstimator:
            train_time = self._fit(X_train, y_train, **kwargs)
        return train_time

-    def predict(self, X):
+    def predict(self, X, **kwargs):
        """Predict label from features.

        Args:
@ -216,7 +216,7 @@ class BaseEstimator:
            )
            return np.ones(X.shape[0])

-    def predict_proba(self, X):
+    def predict_proba(self, X, **kwargs):
        """Predict the probability of each class from features.

        Only works for classification problems
@ -325,7 +325,7 @@ class TransformersEstimator(BaseEstimator):
            },
            "num_train_epochs": {
                "domain": tune.loguniform(lower=0.1, upper=10.0),
-                "init_value": 3,
+                "init_value": 1,
            },
            "per_device_train_batch_size": {
                "domain": tune.choice([4, 8, 16, 32]),
@ -344,33 +344,38 @@ class TransformersEstimator(BaseEstimator):
                "init_value": 1e-6,
            },
            "seed": {"domain": tune.choice(list(range(40, 45))), "init_value": 42},
-            "global_max_steps": {"domain": sys.maxsize, "init_value": sys.maxsize},
+            "global_max_steps": {
+                "domain": sys.maxsize,
+                "init_value": sys.maxsize,
+            },
        }

-        if task in NLG_TASKS:
-            search_space_dict["generation_num_beams"] = {
-                "domain": tune.randint(2, 5),
-                "init_value": 3,
-            }
-            search_space_dict["generation_max_length"] = {
-                "domain": tune.choice([16, 32, 64, 128]),
-                "init_value": 64,
-            }
-
        return search_space_dict

-    def _init_hpo_args(self, automl_fit_kwargs: dict = None):
-        from .nlp.utils import HPOArgs
+    def _init_hf_args(self, automl_fit_kwargs: dict = None):
+        from .nlp.utils import HFArgs

-        custom_hpo_args = HPOArgs()
-        for key, val in automl_fit_kwargs["custom_hpo_args"].items():
+        hf_args = HFArgs()
+        for key, val in automl_fit_kwargs["hf_args"].items():
            assert (
-                key in custom_hpo_args.__dict__
-            ), "The specified key {} is not in the argument list of flaml.nlp.utils::HPOArgs".format(
+                key in hf_args.__dict__
+            ), "The specified key {} is not in the argument list of flaml.nlp.utils::HFArgs".format(
                key
            )
-            setattr(custom_hpo_args, key, val)
-        self.custom_hpo_args = custom_hpo_args
+            setattr(hf_args, key, val)
+        self.hf_args = hf_args
+
+    def _update_hf_args(self, automl_pred_kwargs: dict = None):
+        if automl_pred_kwargs:
+            hf_args = automl_pred_kwargs.get("hf_args")
+            if hf_args:
+                for key, val in hf_args.items():
+                    assert (
+                        key in self.hf_args.__dict__
+                    ), "The specified key {} is not in the argument list of flaml.nlp.utils::HFArgs".format(
+                        key
+                    )
+                    setattr(self.hf_args, key, val)

    def _preprocess(self, X, y=None, **kwargs):
        from .nlp.utils import tokenize_text, is_a_list_of_str
@ -383,7 +388,7 @@ class TransformersEstimator(BaseEstimator):
                X=X,
                Y=y,
                task=self._task,
-                custom_hpo_args=self.custom_hpo_args,
+                hf_args=self.hf_args,
                tokenizer=self._tokenizer,
            )
        else:
@ -392,12 +397,63 @@ class TransformersEstimator(BaseEstimator):
    def _model_init(self, num_labels, per_model_config):
        from .nlp.utils import load_model

-        return load_model(
-            checkpoint_path=self.custom_hpo_args.model_path,
+        this_model = load_model(
+            checkpoint_path=self.hf_args.model_path,
            task=self._task,
            num_labels=num_labels,
            per_model_config=per_model_config,
        )
+        return this_model
+
+    def _get_training_args(self, local_rank=-1):
+        import transformers
+
+        if self._task in NLG_TASKS:
+            self._training_args_config["predict_with_generate"] = True
+
+        if transformers.__version__.startswith("3"):
+            training_args = self._TrainingArguments(
+                report_to=[],
+                output_dir=self._trial_dir,
+                do_train=True,
+                do_eval=True,
+                eval_steps=self._ckpt_freq,
+                evaluate_during_training=True,
+                save_steps=self._ckpt_freq,
+                logging_steps=self._ckpt_freq,
+                save_total_limit=0,
+                metric_for_best_model="loss",
+                fp16=self.hf_args.fp16
+                if self._kwargs.get("gpu_per_trial") > 0
+                else False,
+                no_cuda=True if self._kwargs.get("gpu_per_trial") == 0 else False,
+                local_rank=local_rank,
+                per_device_eval_batch_size=self.hf_args.per_device_eval_batch_size,
+                **self._training_args_config,
+            )
+        else:
+            from transformers import IntervalStrategy
+
+            training_args = self._TrainingArguments(
+                report_to=[],
+                output_dir=self._trial_dir,
+                do_train=True,
+                do_eval=True,
+                eval_steps=self._ckpt_freq,
+                logging_steps=self._ckpt_freq,
+                evaluation_strategy=IntervalStrategy.STEPS,
+                save_steps=self._ckpt_freq,
+                save_total_limit=0,
+                metric_for_best_model="loss",
+                fp16=self.hf_args.fp16
+                if self._kwargs.get("gpu_per_trial") > 0
+                else False,
+                local_rank=local_rank,
+                no_cuda=True if self._kwargs.get("gpu_per_trial") == 0 else False,
+                per_device_eval_batch_size=self.hf_args.per_device_eval_batch_size,
+                **self._training_args_config,
+            )
+        return training_args

    def fit(self, X_train: DataFrame, y_train: Series, budget=None, **kwargs):
        import transformers
@ -411,18 +467,11 @@ class TransformersEstimator(BaseEstimator):
        from .nlp.utils import (
            get_num_labels,
            separate_config,
-            load_model,
            compute_checkpoint_freq,
-            get_trial_fold_name,
+            Counter,
            date_str,
        )

-        # TODO: if self._task == QUESTIONANSWERING, uncomment the code below (add indentation before
-        #  from .nlp.huggingface.trainer import TrainerForAuto)
-
-        # if self._task in NLG_TASKS:
-        #     from .nlp.huggingface.trainer import Seq2SeqTrainerForAuto as TrainerForAuto
-        # else:
        from .nlp.huggingface.trainer import TrainerForAuto
        from .nlp.huggingface.data_collator import DataCollatorForAuto
        from .nlp.utils import get_auto_tokenizer
@ -462,13 +511,22 @@ class TransformersEstimator(BaseEstimator):

        set_seed(self.params.get("seed", self._TrainingArguments.seed))

-        self._init_hpo_args(kwargs)
+        self._init_hf_args(kwargs)
        self._tokenizer = get_auto_tokenizer(
-            self.custom_hpo_args.model_path, self._task
+            self.hf_args.tokenizer_model_path
+            if self.hf_args.tokenizer_model_path
+            else self.hf_args.model_path,
+            self._task,
        )

        self._metric = kwargs["metric"]
-        self.use_ray = kwargs.get("use_ray")
+
+        try:
+            from ray.tune import is_session_enabled
+
+            self.use_ray = is_session_enabled()
+        except ImportError:
+            self.use_ray = False

        X_val = kwargs.get("X_val")
        y_val = kwargs.get("y_val")
@ -498,70 +556,41 @@ class TransformersEstimator(BaseEstimator):
            eval_dataset = None

        num_labels = get_num_labels(self._task, self._y_train)
-        training_args_config, per_model_config = separate_config(
+        self._training_args_config, self._per_model_config = separate_config(
            self.params, self._task
        )
-        ckpt_freq = compute_checkpoint_freq(
+        self._ckpt_freq = compute_checkpoint_freq(
            train_data_size=len(self._X_train),
-            custom_hpo_args=self.custom_hpo_args,
-            num_train_epochs=training_args_config.get(
+            hf_args=self.hf_args,
+            num_train_epochs=self._training_args_config.get(
                "num_train_epochs", self._TrainingArguments.num_train_epochs
            ),
-            batch_size=training_args_config.get(
+            batch_size=self._training_args_config.get(
                "per_device_train_batch_size",
                self._TrainingArguments.per_device_train_batch_size,
            ),
        )

-        local_dir = os.path.join(
-            self.custom_hpo_args.output_dir, "train_{}".format(date_str())
-        )
+        local_dir = os.path.join(self.hf_args.output_dir, "train_{}".format(date_str()))

-        if not self.use_ray:
-            # if self.params = {}, don't include configuration in trial fold name
-            trial_dir = get_trial_fold_name(local_dir, self.params, self.trial_id)
-        else:
+        if self.use_ray is True:
            import ray

-            trial_dir = ray.tune.get_trial_dir()
-
-        if transformers.__version__.startswith("3"):
-            training_args = self._TrainingArguments(
-                report_to=[],
-                output_dir=trial_dir,
-                do_train=True,
-                do_eval=True,
-                eval_steps=ckpt_freq,
-                evaluate_during_training=True,
-                save_steps=ckpt_freq,
-                logging_steps=ckpt_freq,
-                save_total_limit=0,
-                metric_for_best_model="loss",
-                fp16=self.custom_hpo_args.fp16,
-                **training_args_config,
-            )
+            self._trial_dir = ray.tune.get_trial_dir()
        else:
-            from transformers import IntervalStrategy
-
-            training_args = self._TrainingArguments(
-                report_to=[],
-                output_dir=trial_dir,
-                do_train=True,
-                do_eval=True,
-                per_device_eval_batch_size=1,
-                eval_steps=ckpt_freq,
-                logging_steps=ckpt_freq,
-                evaluation_strategy=IntervalStrategy.STEPS,
-                save_steps=ckpt_freq,
-                save_total_limit=0,
-                metric_for_best_model="loss",
-                fp16=self.custom_hpo_args.fp16,
-                **training_args_config,
+            # if self.params = {}, don't include configuration in trial fold name
+            self._trial_dir = Counter.get_trial_fold_name(
+                local_dir, self.params, self.trial_id
            )

+        self._kwargs = kwargs
+        self._num_labels = num_labels
+
+        training_args = self._get_training_args(local_rank=-1)
+
        self._trainer = TrainerForAuto(
            args=training_args,
-            model_init=partial(self._model_init, num_labels, per_model_config),
+            model_init=partial(self._model_init, num_labels, self._per_model_config),
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=self._tokenizer,
@ -575,28 +604,33 @@ class TransformersEstimator(BaseEstimator):
            callbacks=[EarlyStoppingCallbackForAuto],
        )

-        setattr(self._trainer, "_use_ray", self.use_ray)
        if self._task in NLG_TASKS:
            setattr(self._trainer, "_is_seq2seq", True)
-        if kwargs.get("gpu_per_trial"):
-            self._trainer.args._n_gpu = kwargs.get("gpu_per_trial")
+
+        gpu_per_trial = kwargs.get("gpu_per_trial", None)
+        if gpu_per_trial:
+            tmp_cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "")
+            self._trainer.args._n_gpu = gpu_per_trial
+            # if gpu_per_trial == 0:
+            #     os.environ["CUDA_VISIBLE_DEVICES"] = ""
+            if tmp_cuda_visible_devices.count(",") != gpu_per_trial - 1:
+                os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
+                    [str(x) for x in range(gpu_per_trial)]
+                )
+
+        import time
+
+        start_time = time.time()
        self._trainer.train()

+        if gpu_per_trial:
+            os.environ["CUDA_VISIBLE_DEVICES"] = tmp_cuda_visible_devices
+
        self.params[self.ITER_HP] = self._trainer.state.global_step
+
        self._checkpoint_path = self._select_checkpoint(self._trainer)
-
-        self._kwargs = kwargs
-        self._num_labels = num_labels
-        self._per_model_config = per_model_config
-        self._training_args_config = training_args_config
-
        self._ckpt_remains = list(self._trainer.ckpt_to_metric.keys())
-        self._model = load_model(
-            checkpoint_path=self._checkpoint_path,
-            task=self._task,
-            num_labels=self._num_labels,
-            per_model_config=self._per_model_config,
-        )
+
        if hasattr(self._trainer, "intermediate_results"):
            self.intermediate_results = [
                x[1]
@ -605,6 +639,7 @@ class TransformersEstimator(BaseEstimator):
                )
            ]
        self._trainer = None
+        return time.time() - start_time

    def _delete_one_ckpt(self, ckpt_location):
        if self.use_ray is False:
@ -689,16 +724,21 @@ class TransformersEstimator(BaseEstimator):
        from datasets import Dataset
        from .nlp.huggingface.trainer import TrainerForAuto
        from .nlp.huggingface.data_collator import DataCollatorForPredict
+        from .nlp.utils import load_model

        X_test, _ = self._preprocess(X_test, **self._kwargs)
        test_dataset = Dataset.from_pandas(X_test)
-        training_args = self._TrainingArguments(
-            per_device_eval_batch_size=1,
-            output_dir=self.custom_hpo_args.output_dir,
-            **self._training_args_config,
+
+        this_model = load_model(
+            checkpoint_path=self._checkpoint_path,
+            task=self._task,
+            num_labels=self._num_labels,
+            per_model_config=self._per_model_config,
        )
-        self._trainer = TrainerForAuto(
-            model=self._model,
+        training_args = self._get_training_args(local_rank=-1)
+
+        new_trainer = TrainerForAuto(
+            model=this_model,
            args=training_args,
            data_collator=DataCollatorForPredict(
                tokenizer=self._tokenizer,
@ -708,31 +748,36 @@ class TransformersEstimator(BaseEstimator):
            else None,
            compute_metrics=self._compute_metrics_by_dataset_name,
        )
-        return test_dataset, training_args
+        if self._task in NLG_TASKS:
+            setattr(new_trainer, "_is_seq2seq", True)
+        return new_trainer, test_dataset, training_args

-    def predict_proba(self, X):
+    def predict_proba(self, X, **kwargs):
+        self._update_hf_args(kwargs)
        assert (
            self._task in CLASSIFICATION
        ), "predict_proba() only for classification tasks."

-        test_dataset, _ = self._init_model_for_predict(X)
-        predictions = self._trainer.predict(test_dataset)
-        if self.use_ray is True:
-            self._trainer = None
+        new_trainer, test_dataset, _ = self._init_model_for_predict(X)
+        predictions = new_trainer.predict(test_dataset)
        return predictions.predictions

-    def predict(self, X):
-        test_dataset, training_args = self._init_model_for_predict(X)
+    def predict(self, X, **kwargs):
+        import transformers
+
+        transformers.logging.set_verbosity_error()
+
+        self._update_hf_args(kwargs)
+        new_trainer, test_dataset, training_args = self._init_model_for_predict(X)
+
        if self._task not in NLG_TASKS:
-            predictions = self._trainer.predict(test_dataset)
+            predictions = new_trainer.predict(test_dataset)
        else:
-            predictions = self._trainer.predict(
+            predictions = new_trainer.predict(
                test_dataset,
-                max_length=training_args.generation_max_length,
-                num_beams=training_args.generation_num_beams,
+                metric_key_prefix="predict",
            )
-        if self.use_ray is True:
-            self._trainer = None
+
        if self._task == SEQCLASSIFICATION:
            return np.argmax(predictions.predictions, axis=1)
        elif self._task == SEQREGRESSION:
@ -740,10 +785,8 @@ class TransformersEstimator(BaseEstimator):
        elif self._task == TOKENCLASSIFICATION:
            return np.argmax(predictions.predictions, axis=2)
        elif self._task == SUMMARIZATION:
-            if isinstance(predictions.predictions, tuple):
-                predictions = np.argmax(predictions.predictions[0], axis=2)
            decoded_preds = self._tokenizer.batch_decode(
-                predictions, skip_special_tokens=True
+                predictions.predictions, skip_special_tokens=True
            )
            return decoded_preds
        elif self._task == MULTICHOICECLASSIFICATION:
@ -1121,7 +1164,7 @@ class XGBoostEstimator(SKLearnEstimator):
        train_time = time.time() - start_time
        return train_time

-    def predict(self, X):
+    def predict(self, X, **kwargs):
        import xgboost as xgb

        if not issparse(X):
@ -1617,7 +1660,7 @@ class Prophet(SKLearnEstimator):
        self._model = model
        return train_time

-    def predict(self, X):
+    def predict(self, X, **kwargs):
        if isinstance(X, int):
            raise ValueError(
                "predict() with steps is only supported for arima/sarimax."
@ -1697,7 +1740,7 @@ class ARIMA(Prophet):
        self._model = model
        return train_time

-    def predict(self, X):
+    def predict(self, X, **kwargs):
        if self._model is not None:
            if isinstance(X, int):
                forecast = self._model.forecast(steps=X)
@ -1894,7 +1937,7 @@ class TS_SKLearn(SKLearnEstimator):
        train_time = time.time() - current_time
        return train_time

-    def predict(self, X):
+    def predict(self, X, **kwargs):
        if self._model is not None:
            X = self.transform_X(X)
            X = self._preprocess(X)
--- a/flaml/nlp/utils.py
+++ b/flaml/nlp/utils.py
@ -2,6 +2,7 @@ import argparse
 from dataclasses import dataclass, field
 from itertools import chain
 from typing import Dict, Any
+import numpy as np

 from ..data import (
    SUMMARIZATION,
@ -20,61 +21,54 @@ def load_default_huggingface_metric_for_task(task):
    elif task == SEQREGRESSION:
        return "r2"
    elif task == SUMMARIZATION:
-        return "rouge"
+        return "rouge1"
    elif task == MULTICHOICECLASSIFICATION:
        return "accuracy"
    elif task == TOKENCLASSIFICATION:
        return "seqeval"


-global tokenized_column_names
-
-
-def get_auto_tokenizer(model_path, task):
+def get_auto_tokenizer(tokenizer_model_path, task):
    from transformers import AutoTokenizer

    if task == SUMMARIZATION:
        return AutoTokenizer.from_pretrained(
-            model_path,  # 'roberta-base'
+            pretrained_model_name_or_path=tokenizer_model_path,
            cache_dir=None,
            use_fast=True,
            revision="main",
            use_auth_token=None,
        )
    else:
-        return AutoTokenizer.from_pretrained(model_path, use_fast=True)
+        return AutoTokenizer.from_pretrained(tokenizer_model_path, use_fast=True)


-def tokenize_text(X, Y=None, task=None, custom_hpo_args=None, tokenizer=None):
+def tokenize_text(X, Y=None, task=None, hf_args=None, tokenizer=None):
    if task in (SEQCLASSIFICATION, SEQREGRESSION):
        X_tokenized = tokenize_onedataframe(
            X,
            tokenizer=tokenizer,
            task=task,
-            custom_hpo_args=custom_hpo_args,
+            hf_args=hf_args,
            prefix_str="",
        )
        return X_tokenized, None
    elif task == TOKENCLASSIFICATION:
        return tokenize_text_tokclassification(
-            X, Y, tokenizer=tokenizer, custom_hpo_args=custom_hpo_args
+            X, Y, tokenizer=tokenizer, hf_args=hf_args
        )
    elif task in NLG_TASKS:
-        return tokenize_seq2seq(
-            X, Y, tokenizer=tokenizer, task=task, custom_hpo_args=custom_hpo_args
-        )
+        return tokenize_seq2seq(X, Y, tokenizer=tokenizer, task=task, hf_args=hf_args)
    elif task == MULTICHOICECLASSIFICATION:
-        return tokenize_text_multiplechoice(
-            X, tokenizer=tokenizer, custom_hpo_args=custom_hpo_args
-        )
+        return tokenize_text_multiplechoice(X, tokenizer=tokenizer, hf_args=hf_args)


-def tokenize_seq2seq(X, Y, tokenizer, task=None, custom_hpo_args=None):
+def tokenize_seq2seq(X, Y, tokenizer, task=None, hf_args=None):
    model_inputs = tokenize_onedataframe(
        X,
        tokenizer=tokenizer,
        task=task,
-        custom_hpo_args=custom_hpo_args,
+        hf_args=hf_args,
        prefix_str="summarize: ",
    )
    labels = None
@ -83,7 +77,7 @@ def tokenize_seq2seq(X, Y, tokenizer, task=None, custom_hpo_args=None):
            Y.to_frame(),
            tokenizer=tokenizer,
            task=task,
-            custom_hpo_args=custom_hpo_args,
+            hf_args=hf_args,
            prefix_str="",
        )
        labels["label"] = [
@ -97,15 +91,18 @@ def tokenize_seq2seq(X, Y, tokenizer, task=None, custom_hpo_args=None):


 def tokenize_and_align_labels(
-    examples, tokenizer, custom_hpo_args=None, X_sent_key=None, Y_sent_key=None
+    examples,
+    tokenizer,
+    hf_args=None,
+    X_sent_key=None,
+    Y_sent_key=None,
+    return_column_name=False,
 ):
-    global tokenized_column_names
-
    tokenized_inputs = tokenizer(
        [list(examples[X_sent_key])],
        padding="max_length",
        truncation=True,
-        max_length=custom_hpo_args.max_seq_length,
+        max_length=hf_args.max_seq_length,
        # We use this argument because the texts in our dataset are lists of words (with a label for each word).
        is_split_into_words=True,
    )
@ -134,27 +131,37 @@ def tokenize_and_align_labels(
                #     label_ids.append(b_to_i_label[label_to_id[label[word_idx]]])
            previous_word_idx = word_idx
        tokenized_inputs["label"] = label_ids
-    tokenized_column_names = sorted(tokenized_inputs.keys())
-    tokenized_input_and_labels = [tokenized_inputs[x] for x in tokenized_column_names]
-    for key_idx, each_key in enumerate(tokenized_column_names):
+    tmp_column_names = sorted(tokenized_inputs.keys())
+    tokenized_input_and_labels = [tokenized_inputs[x] for x in tmp_column_names]
+    for key_idx, each_key in enumerate(tmp_column_names):
        if each_key != "label":
            tokenized_input_and_labels[key_idx] = tokenized_input_and_labels[key_idx][0]
-    return tokenized_input_and_labels
+    if return_column_name:
+        return tokenized_input_and_labels, tmp_column_names
+    else:
+        return tokenized_input_and_labels


-def tokenize_text_tokclassification(X, Y, tokenizer, custom_hpo_args=None):
+def tokenize_text_tokclassification(X, Y, tokenizer, hf_args=None):
    import pandas as pd

-    global tokenized_column_names
    if Y is not None:
        X_and_Y = pd.concat([X, Y.to_frame()], axis=1)
        X_key = list(X.keys())[0]
        Y_key = list(Y.to_frame().keys())[0]
+        _, tokenized_column_names = tokenize_and_align_labels(
+            X_and_Y.iloc[0],
+            tokenizer=tokenizer,
+            hf_args=hf_args,
+            X_sent_key=X_key,
+            Y_sent_key=Y_key,
+            return_column_name=True,
+        )
        X_and_Y_tokenized = X_and_Y.apply(
            lambda x: tokenize_and_align_labels(
                x,
                tokenizer=tokenizer,
-                custom_hpo_args=custom_hpo_args,
+                hf_args=hf_args,
                X_sent_key=X_key,
                Y_sent_key=Y_key,
            ),
@ -170,11 +177,21 @@ def tokenize_text_tokclassification(X, Y, tokenizer, custom_hpo_args=None):
        y_tokenized = X_and_Y_tokenized.iloc[:, label_idx]
    else:
        X_key = list(X.keys())[0]
+
+        _, tokenized_column_names = tokenize_and_align_labels(
+            X.iloc[0],
+            tokenizer=tokenizer,
+            hf_args=hf_args,
+            X_sent_key=X_key,
+            Y_sent_key=None,
+            return_column_name=True,
+        )
+
        d = X.apply(
            lambda x: tokenize_and_align_labels(
                x,
                tokenizer=tokenizer,
-                custom_hpo_args=custom_hpo_args,
+                hf_args=hf_args,
                X_sent_key=X_key,
                Y_sent_key=None,
            ),
@ -192,28 +209,34 @@ def tokenize_onedataframe(
    X,
    tokenizer,
    task=None,
-    custom_hpo_args=None,
+    hf_args=None,
    prefix_str=None,
 ):
    import pandas

-    global tokenized_column_names
-
    with tokenizer.as_target_tokenizer():
+        _, tokenized_column_names = tokenize_row(
+            dict(X.iloc[0]),
+            tokenizer,
+            prefix=(prefix_str,) if task is SUMMARIZATION else None,
+            task=task,
+            hf_args=hf_args,
+            return_column_name=True,
+        )
        d = X.apply(
            lambda x: tokenize_row(
                x,
                tokenizer,
                prefix=(prefix_str,) if task is SUMMARIZATION else None,
                task=task,
-                custom_hpo_args=custom_hpo_args,
+                hf_args=hf_args,
            ),
            axis=1,
            result_type="expand",
        )
-    X_tokenized = pandas.DataFrame(columns=tokenized_column_names)
-    X_tokenized[tokenized_column_names] = d
-    return X_tokenized
+        X_tokenized = pandas.DataFrame(columns=tokenized_column_names)
+        X_tokenized[tokenized_column_names] = d
+        return X_tokenized


 def postprocess_text(preds, labels):
@ -230,35 +253,49 @@ def postprocess_text(preds, labels):
    return preds, labels


-def tokenize_row(this_row, tokenizer, prefix=None, task=None, custom_hpo_args=None):
-    global tokenized_column_names
+def tokenize_row(
+    this_row,
+    tokenizer,
+    prefix=None,
+    task=None,
+    hf_args=None,
+    return_column_name=False,
+):
    assert (
-        "max_seq_length" in custom_hpo_args.__dict__
+        "max_seq_length" in hf_args.__dict__
    ), "max_seq_length must be provided for glue"

    if prefix:
        this_row = tuple(["".join(x) for x in zip(prefix, this_row)])

+    # tokenizer.pad_token = tokenizer.eos_token
    tokenized_example = tokenizer(
        *tuple(this_row),
        padding="max_length",
-        max_length=custom_hpo_args.max_seq_length,
+        max_length=hf_args.max_seq_length,
        truncation=True,
    )
    if task in NLG_TASKS:
        tokenized_example["decoder_input_ids"] = tokenized_example["input_ids"]
-    tokenized_column_names = sorted(tokenized_example.keys())
-    return [tokenized_example[x] for x in tokenized_column_names]
+    tmp_column_names = sorted(tokenized_example.keys())
+    if return_column_name:
+        return [tokenized_example[x] for x in tmp_column_names], tmp_column_names
+    else:
+        return [tokenized_example[x] for x in tmp_column_names]


-def tokenize_text_multiplechoice(X, tokenizer, custom_hpo_args=None):
+def tokenize_text_multiplechoice(X, tokenizer, hf_args=None):
    import pandas

-    global tokenized_column_names
-
    t = X[["sent1", "sent2", "ending0", "ending1", "ending2", "ending3"]]
+    _, tokenized_column_names = tokenize_swag(
+        t.iloc[0],
+        tokenizer=tokenizer,
+        hf_args=hf_args,
+        return_column_name=True,
+    )
    d = t.apply(
-        lambda x: tokenize_swag(x, tokenizer, custom_hpo_args),
+        lambda x: tokenize_swag(x, tokenizer=tokenizer, hf_args=hf_args),
        axis=1,
        result_type="expand",
    )
@ -269,9 +306,7 @@ def tokenize_text_multiplechoice(X, tokenizer, custom_hpo_args=None):
    return output, None


-def tokenize_swag(this_row, tokenizer, custom_hpo_args=None):
-    global tokenized_column_names
-
+def tokenize_swag(this_row, tokenizer, hf_args=None, return_column_name=False):
    first_sentences = [[this_row["sent1"]] * 4]
    # repeat the first sentence (sent1) 4 times
    question_headers = this_row["sent2"]
@ -289,11 +324,15 @@ def tokenize_swag(this_row, tokenizer, custom_hpo_args=None):
    tokenized_example = tokenizer(
        *tuple([first_sentences, second_sentences]),
        truncation=True,
-        max_length=custom_hpo_args.max_seq_length,
+        max_length=hf_args.max_seq_length,
        padding=False,
    )
-    tokenized_column_names = sorted(tokenized_example.keys())
-    return [tokenized_example[x] for x in tokenized_column_names]
+    tmp_column_names = sorted(tokenized_example.keys())
+
+    if return_column_name:
+        return [tokenized_example[x] for x in tmp_column_names], tmp_column_names
+    else:
+        return [tokenized_example[x] for x in tmp_column_names]


 def separate_config(config, task):
@ -333,7 +372,9 @@ def get_num_labels(task, y_train):


 def is_a_list_of_str(this_obj):
-    return isinstance(this_obj, list) and all(isinstance(x, str) for x in this_obj)
+    return (isinstance(this_obj, list) or isinstance(this_obj, np.ndarray)) and all(
+        isinstance(x, str) for x in this_obj
+    )


 def _clean_value(value: Any) -> str:
@ -386,14 +427,19 @@ def get_logdir_name(dirname, local_dir):
    return logdir


-def get_trial_fold_name(local_dir, trial_config, trial_id):
-    global counter
-    counter = counter + 1
-    experiment_tag = "{0}_{1}".format(str(counter), format_vars(trial_config))
-    logdir = get_logdir_name(
-        _generate_dirname(experiment_tag, trial_id=trial_id), local_dir
-    )
-    return logdir
+class Counter:
+    counter = 0
+
+    @staticmethod
+    def get_trial_fold_name(local_dir, trial_config, trial_id):
+        Counter.counter += 1
+        experiment_tag = "{0}_{1}".format(
+            str(Counter.counter), format_vars(trial_config)
+        )
+        logdir = get_logdir_name(
+            _generate_dirname(experiment_tag, trial_id=trial_id), local_dir
+        )
+        return logdir


 def load_model(checkpoint_path, task, num_labels, per_model_config=None):
@ -499,7 +545,7 @@ def load_model(checkpoint_path, task, num_labels, per_model_config=None):

 def compute_checkpoint_freq(
    train_data_size,
-    custom_hpo_args,
+    hf_args,
    num_train_epochs,
    batch_size,
 ):
@ -508,7 +554,7 @@ def compute_checkpoint_freq(
            min(num_train_epochs, 1)
            * train_data_size
            / batch_size
-            / custom_hpo_args.ckpt_per_epoch
+            / hf_args.ckpt_per_epoch
        )
        + 1
    )
@ -516,7 +562,7 @@ def compute_checkpoint_freq(


@dataclass
-class HPOArgs:
+class HFArgs:
    """The HPO setting.
    Args:
        output_dir (str): data root directory for outputting the log, etc.
@ -534,7 +580,12 @@ class HPOArgs:

    model_path: str = field(
        default="facebook/muppet-roberta-base",
-        metadata={"help": "model path model for HPO"},
+        metadata={"help": "model path for HPO"},
+    )
+
+    tokenizer_model_path: str = field(
+        default=None,
+        metadata={"help": "tokenizer model path for HPO"},
    )

    fp16: bool = field(default=True, metadata={"help": "whether to use the FP16 mode"})
@ -552,12 +603,17 @@ class HPOArgs:

    ckpt_per_epoch: int = field(default=1, metadata={"help": "checkpoint per epoch"})

+    per_device_eval_batch_size: int = field(
+        default=1,
+        metadata={"help": "per gpu evaluation batch size"},
+    )
+
    @staticmethod
    def load_args():
        from dataclasses import fields

        arg_parser = argparse.ArgumentParser()
-        for each_field in fields(HPOArgs):
+        for each_field in fields(HFArgs):
            print(each_field)
            arg_parser.add_argument(
                "--" + each_field.name,
--- a/flaml/training_log.py
+++ b/flaml/training_log.py
@ -79,7 +79,7 @@ class TrainingLogWriter(object):
        sample_size,
    ):
        if self.file is None:
-            raise IOError("Call open() to open the outpute file first.")
+            raise IOError("Call open() to open the output file first.")
        if validation_loss is None:
            raise ValueError("TEST LOSS NONE ERROR!!!")
        record = TrainingLogRecord(
@ -109,7 +109,7 @@ class TrainingLogWriter(object):

    def checkpoint(self):
        if self.file is None:
-            raise IOError("Call open() to open the outpute file first.")
+            raise IOError("Call open() to open the output file first.")
        if self.current_best_loss_record_id is None:
            logger.warning(
                "flaml.training_log: checkpoint() called before any record is written, skipped."
--- a/notebook/automl_nlp.ipynb
+++ b/notebook/automl_nlp.ipynb
--- a/test/automl/test_notebook_example.py
+++ b/test/automl/test_notebook_example.py
@ -4,12 +4,17 @@ from requests.exceptions import ChunkedEncodingError

 def test_automl(budget=5, dataset_format="dataframe", hpo_method=None):
    from flaml.data import load_openml_dataset
+    import urllib3

    try:
        X_train, X_test, y_train, y_test = load_openml_dataset(
            dataset_id=1169, data_dir="test/", dataset_format=dataset_format
        )
-    except (OpenMLServerException, ChunkedEncodingError) as e:
+    except (
+        OpenMLServerException,
+        ChunkedEncodingError,
+        urllib3.exceptions.ReadTimeoutError,
+    ) as e:
        print(e)
        return
    """ import AutoML class from flaml package """
--- a/test/load_args.py
+++ b/test/load_args.py
@ -1,7 +1,7 @@
 def test_load_args_sub():
-    from flaml.nlp.utils import HPOArgs
+    from flaml.nlp.utils import HFArgs

-    HPOArgs.load_args()
+    HFArgs.load_args()


 if __name__ == "__main__":
--- a/test/nlp/test_autohf.py
+++ b/test/nlp/test_autohf.py
@ -84,9 +84,10 @@ def test_hf_data():
        "task": "seq-classification",
        "metric": "accuracy",
        "log_file_name": "seqclass.log",
+        "use_ray": False,
    }

-    automl_settings["custom_hpo_args"] = {
+    automl_settings["hf_args"] = {
        "model_path": "google/electra-small-discriminator",
        "output_dir": "test/data/output/",
        "ckpt_per_epoch": 5,
@ -116,7 +117,6 @@ def test_hf_data():
        pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
    with open("automl.pkl", "rb") as f:
        automl = pickle.load(f)
-    shutil.rmtree("test/data/output/")
    automl.predict(X_test)
    automl.predict(["test test", "test test"])
    automl.predict(
@ -164,7 +164,7 @@ def _test_custom_data():
        "metric": "accuracy",
    }

-    automl_settings["custom_hpo_args"] = {
+    automl_settings["hf_args"] = {
        "model_path": "google/electra-small-discriminator",
        "output_dir": "data/output/",
        "ckpt_per_epoch": 1,
@ -183,6 +183,16 @@ def _test_custom_data():
        ]
    )

+    import pickle
+
+    automl.pickle("automl.pkl")
+
+    with open("automl.pkl", "rb") as f:
+        automl = pickle.load(f)
+    config = automl.best_config.copy()
+    config["learner"] = automl.best_estimator
+    automl.trainable(config)
+

 if __name__ == "__main__":
    test_hf_data()
--- a/test/nlp/test_autohf_classificationhead.py
+++ b/test/nlp/test_autohf_classificationhead.py
@ -52,7 +52,7 @@ def test_classification_head():
        "metric": "accuracy",
    }

-    automl_settings["custom_hpo_args"] = {
+    automl_settings["hf_args"] = {
        "model_path": "google/electra-small-discriminator",
        "output_dir": "test/data/output/",
        "ckpt_per_epoch": 1,
--- a/test/nlp/test_autohf_custom_metric.py
+++ b/test/nlp/test_autohf_custom_metric.py
@ -19,8 +19,7 @@ def custom_metric(
    from flaml.model import TransformersEstimator

    if estimator._trainer is None:
-        estimator._init_model_for_predict(X_test)
-        trainer = estimator._trainer
+        trainer, _, _ = estimator._init_model_for_predict(X_test)
        estimator._trainer = None
    else:
        trainer = estimator._trainer
@ -103,7 +102,7 @@ def test_custom_metric():
        "log_file_name": "seqclass.log",
    }

-    automl_settings["custom_hpo_args"] = {
+    automl_settings["hf_args"] = {
        "model_path": "google/electra-small-discriminator",
        "output_dir": "data/output/",
        "ckpt_per_epoch": 1,
--- a/test/nlp/test_autohf_cv.py
+++ b/test/nlp/test_autohf_cv.py
@ -43,7 +43,7 @@ def test_cv():
        "n_splits": 3,
    }

-    automl_settings["custom_hpo_args"] = {
+    automl_settings["hf_args"] = {
        "model_path": "google/electra-small-discriminator",
        "output_dir": "test/data/output/",
        "ckpt_per_epoch": 1,
--- a/test/nlp/test_autohf_multichoice_classification.py
+++ b/test/nlp/test_autohf_multichoice_classification.py
@ -216,7 +216,7 @@ def test_mcc():
        "log_file_name": "seqclass.log",
    }

-    automl_settings["custom_hpo_args"] = {
+    automl_settings["hf_args"] = {
        "model_path": "google/electra-small-discriminator",
        "output_dir": "test/data/output/",
        "ckpt_per_epoch": 1,
--- a/test/nlp/test_autohf_regression.py
+++ b/test/nlp/test_autohf_regression.py
@ -6,6 +6,9 @@ import pytest
 def test_regression():
    try:
        import ray
+
+        if not ray.is_initialized():
+            ray.init()
    except ImportError:
        return
    from flaml import AutoML
@ -65,10 +68,10 @@ def test_regression():
        "task": "seq-regression",
        "metric": "pearsonr",
        "starting_points": {"transformer": {"num_train_epochs": 1}},
-        "use_ray": True,
+        "use_ray": {"local_dir": "data/outut/"},
    }

-    automl_settings["custom_hpo_args"] = {
+    automl_settings["hf_args"] = {
        "model_path": "google/electra-small-discriminator",
        "output_dir": "test/data/output/",
        "ckpt_per_epoch": 1,
@ -77,6 +80,7 @@ def test_regression():

    ray.shutdown()
    ray.init()
+
    automl.fit(
        X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings
    )
--- a/test/nlp/test_autohf_summarization.py
+++ b/test/nlp/test_autohf_summarization.py
@ -58,7 +58,7 @@ def test_summarization():
        "log_file_name": "seqclass.log",
    }

-    automl_settings["custom_hpo_args"] = {
+    automl_settings["hf_args"] = {
        "model_path": "patrickvonplaten/t5-tiny-random",
        "output_dir": "test/data/output/",
        "ckpt_per_epoch": 1,
--- a/test/nlp/test_autohf_tokenclassification.py
+++ b/test/nlp/test_autohf_tokenclassification.py
@ -726,7 +726,7 @@ def test_tokenclassification():
        "metric": "seqeval",
    }

-    automl_settings["custom_hpo_args"] = {
+    automl_settings["hf_args"] = {
        "model_path": "bert-base-uncased",
        "output_dir": "test/data/output/",
        "ckpt_per_epoch": 1,
--- a/test/test_gpu.py
+++ b/test/test_gpu.py
@ -81,7 +81,7 @@ def _test_hf_data():
        "use_ray": True,
    }

-    automl_settings["custom_hpo_args"] = {
+    automl_settings["hf_args"] = {
        "model_path": "facebook/muppet-roberta-base",
        "output_dir": "test/data/output/",
        "ckpt_per_epoch": 5,
--- a/website/docs/Examples/AutoML-NLP.md
+++ b/website/docs/Examples/AutoML-NLP.md
@ -26,8 +26,8 @@ automl = AutoML()
 automl_settings = {
    "time_budget": 100,
    "task": "seq-classification",
-    "custom_hpo_args": {"output_dir": "data/output/"},
-    "gpu_per_trial": 1,  # set to 0 if no GPU is available
+    "hf_args": {"output_dir": "data/output/"},  # setting the huggingface arguments: output directory
+    "gpu_per_trial": 1,                         # set to 0 if no GPU is available
 }
 automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings)
 automl.predict(X_test)
@ -77,11 +77,11 @@ automl_settings = {
    "task": "seq-regression",
    "metric": "rmse",
 }
-automl_settings["custom_hpo_args"] = {
-    "model_path": "google/electra-small-discriminator",
-    "output_dir": "data/output/",
-    "ckpt_per_epoch": 5,
-    "fp16": False,
+automl_settings["hf_args"] = {                          # setting the huggingface arguments
+    "model_path": "google/electra-small-discriminator", # setting the language model
+    "output_dir": "data/output/",                       # setting the output directory
+    "ckpt_per_epoch": 5,                                # setting the number of checkpoints per epoch
+    "fp16": False,                                      # setting whether to use FP16
 }
 automl.fit(
    X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings
@ -127,11 +127,11 @@ automl_settings = {
    "task": "summarization",
    "metric": "rouge1",
 }
-automl_settings["custom_hpo_args"] = {
-    "model_path": "t5-small",
-    "output_dir": "data/output/",
-    "ckpt_per_epoch": 5,
-    "fp16": False,
+automl_settings["hf_args"] = {            # setting the huggingface arguments
+    "model_path": "t5-small",             # setting the language model
+    "output_dir": "data/output/",         # setting the output directory
+    "ckpt_per_epoch": 5,                  # setting the number of checkpoints per epoch
+    "fp16": False,                        # setting whether to use FP16
 }
 automl.fit(
    X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings
@ -205,4 +205,10 @@ Model config T5Config {
 }
 ```

-For tasks that are not currently supported, use `flaml.tune` for [customized tuning](Tune-HuggingFace).
+For tasks that are not currently supported, use `flaml.tune` for [customized tuning](Tune-HuggingFace).
+
+### Link to Jupyter notebook
+
+To run these examples in our Jupyter notebook, please go to:
+
+[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/automl_nlp.ipynb) | [Open in Colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/automl_nlp.ipynb)