adding evaluation (#495)

* adding automl.score

* fixing the metric name in train_with_config

* adding pickle after score

* fixing a bug in automl.pickle
Xueqing Liu 2022-03-25 17:00:08 -04:00 committed by GitHub
parent 1d029436e7
commit 5f97532986
11 changed files with 375 additions and 39 deletions
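The headline change is a model-level evaluation entry point, `AutoML.score`, plus a more robust `AutoML.pickle`. A minimal usage sketch mirroring the tests added below (the dataset, split, and time budget are illustrative, not from the commit):

```python
import pickle

from flaml import AutoML
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True, as_frame=True)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

automl = AutoML()
automl.fit(X_train, y_train, task="classification", time_budget=10)

# New in this commit: score the best trained estimator on held-out data.
# Without a metric kwarg, classification falls back to accuracy and
# regression to r2; a metric name can also be passed explicitly.
print(automl.score(X_val, y_val))
print(automl.score(X_val, y_val, metric="accuracy"))

# Also touched here: pickling the AutoML object after scoring.
automl.pickle("automl.pkl")
with open("automl.pkl", "rb") as f:
    automl = pickle.load(f)
```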

.gitignore vendored (4 changes)
View File

@ -156,3 +156,7 @@ automl.pkl
.idea/*
.DS_Store
test/nlp/testtmp.py
test/nlp/testtmpfl.py

View File

@ -246,11 +246,6 @@ class AutoMLState:
* sample_size
/ state.data_size[0]
)
# raise Exception("bbbbb", state.time_budget, budget)
if _is_nlp_task(state.task):
state.fit_kwargs["X_val"] = state.X_val
state.fit_kwargs["y_val"] = state.y_val
(
trained_estimator,
@ -344,7 +339,7 @@ class AutoMLState:
estimator_class=self.learner_classes.get(estimator),
budget=budget,
fit_kwargs=self.fit_kwargs,
eval_metric="train_time",
eval_metric=self.metric if hasattr(self, "metric") else "train_time",
)
if sampled_weight is not None:
@ -699,6 +694,16 @@ class AutoML(BaseEstimator):
"""Time taken to find best model in seconds."""
return self.__dict__.get("_time_taken_best_iter")
def score(self, X: pd.DataFrame, y: pd.Series, **kwargs):
estimator = getattr(self, "_trained_estimator", None)
if estimator is None:
logger.warning(
"No estimator is trained. Please run fit with enough budget."
)
return None
X = self._preprocess(X)
return estimator.score(X, y, **kwargs)
def predict(
self,
X: Union[np.array, pd.DataFrame, List[str], List[List[str]]],
@ -1259,7 +1264,7 @@ class AutoML(BaseEstimator):
record_id: An integer of the record ID in the file,
0 corresponds to the first trial.
task: A string of the task type,
'binary', 'multi', 'regression', 'ts_forecast', 'rank'.
'binary', 'multiclass', 'regression', 'ts_forecast', 'rank'.
Returns:
An estimator object for the given configuration.
@ -1645,8 +1650,12 @@ class AutoML(BaseEstimator):
estimator_to_training_function = {}
for estimator in self.estimator_list:
search_state = self._search_states[estimator]
estimator_to_training_function[estimator] = search_state.training_function
del search_state.training_function
if hasattr(search_state, "training_function"):
estimator_to_training_function[
estimator
] = search_state.training_function
del search_state.training_function
with open(output_file_name, "wb") as f:
pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
@ -1781,7 +1790,7 @@ class AutoML(BaseEstimator):
'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mae', 'mse', 'r2',
'mape'. Default is 'auto'.
If passing a customized metric function, the function needs to
have the follwing signature:
have the following signature:
```python
def custom_metric(
X_test, y_test, estimator, labels,
@ -2114,7 +2123,7 @@ class AutoML(BaseEstimator):
metric = load_default_huggingface_metric_for_task(self._state.task)
elif "binary" in self._state.task:
metric = "roc_auc"
elif "multi" in self._state.task:
elif "multiclass" in self._state.task:
metric = "log_loss"
elif self._state.task in TS_FORECAST:
metric = "mape"
@ -2838,7 +2847,7 @@ class AutoML(BaseEstimator):
estimators = []
if self._ensemble and self._state.task in (
"binary",
"multi",
"multiclass",
"regression",
):
search_states = list(

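The `hasattr` guard added to `pickle()` above means a search state without a `training_function` attribute (never set, or already stripped by an earlier call) no longer raises `AttributeError` while pickling. A small standalone sketch of the guarded strip-then-dump pattern; `_SearchState` is a stand-in, not FLAML's actual search state:

```python
import pickle


class _SearchState:
    """Stand-in for a per-estimator search state (illustrative only)."""

    def __init__(self):
        # Lambdas cannot be pickled, which is why they must be stripped first.
        self.training_function = lambda config: config


def pickle_states(states, path):
    """Strip unpicklable callables, then dump; safe to call more than once."""
    for state in states.values():
        # The hasattr guard is the fix: states that were already stripped
        # (or never populated) no longer raise AttributeError on deletion.
        if hasattr(state, "training_function"):
            del state.training_function
    with open(path, "wb") as f:
        pickle.dump(states, f, pickle.HIGHEST_PROTOCOL)


states = {"lgbm": _SearchState(), "xgboost": _SearchState()}
pickle_states(states, "states.pkl")
pickle_states(states, "states.pkl")  # second call is now a no-op strip
```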
View File

@ -18,7 +18,7 @@ MULTICHOICECLASSIFICATION = "multichoice-classification"
TOKENCLASSIFICATION = "token-classification"
CLASSIFICATION = (
"binary",
"multi",
"multiclass",
"classification",
SEQCLASSIFICATION,
MULTICHOICECLASSIFICATION,

View File

@ -25,7 +25,7 @@ def load_config_predictor(estimator_name, task, location=None):
predictor = CONFIG_PREDICTORS.get(key)
if predictor:
return predictor
task = "multiclass" if task == "multi" else task
task = "multiclass" if task == "multi" else task # TODO: multi -> multiclass?
try:
location = location or LOCATION
with open(f"{location}/{estimator_name}/{task}.json", "r") as f:

View File

@ -219,6 +219,13 @@ def is_in_sklearn_metric_name_set(metric_name):
return metric_name.startswith("ndcg") or metric_name in sklearn_metric_name_set
def is_min_metric(metric_name):
return (
metric_name in ["rmse", "mae", "mse", "log_loss", "mape"]
or huggingface_metric_to_mode.get(metric_name, None) == "min"
)
def sklearn_metric_loss_score(
metric_name,
y_predict,
@ -565,6 +572,8 @@ def compute_estimator(
if isinstance(estimator, TransformersEstimator):
fit_kwargs["metric"] = eval_metric
fit_kwargs["X_val"] = X_val
fit_kwargs["y_val"] = y_val
if "holdout" == eval_method:
val_loss, metric_for_logging, train_time, pred_time = get_val_loss(
@ -633,7 +642,7 @@ def get_classification_objective(num_labels: int) -> str:
if num_labels == 2:
objective_name = "binary"
else:
objective_name = "multi"
objective_name = "multiclass"
return objective_name

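The new `is_min_metric` helper lets the estimator-level `score` (in the next file) decide whether to report the internal loss as-is or its complement. A simplified, self-contained sketch of that convention; the real helper also consults `huggingface_metric_to_mode`, and FLAML's internal loss for a maximization metric such as accuracy is 1 - metric:

```python
def is_min_metric(metric_name):
    # Loss-style metrics: smaller is better, report the loss directly.
    return metric_name in ["rmse", "mae", "mse", "log_loss", "mape"]


def report_score(metric_name, loss):
    # Maximization metrics (accuracy, f1, ...) are tracked internally as
    # 1 - metric, so the user-facing score is recovered as 1 - loss.
    return loss if is_min_metric(metric_name) else 1.0 - loss


print(report_score("mse", 0.12))       # 0.12, lower is better
print(report_score("accuracy", 0.05))  # 0.95, higher is better
```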
View File

@ -88,7 +88,9 @@ class BaseEstimator:
Args:
task: A string of the task type, one of
'binary', 'multi', 'regression', 'rank', 'forecast'.
'binary', 'multiclass', 'regression', 'rank', 'seq-classification',
'seq-regression', 'token-classification', 'multichoice-classification',
'summarization', 'ts_forecast', 'ts_forecast_classification'.
config: A dictionary containing the hyperparameter names, 'n_jobs' as keys.
n_jobs is the number of parallel threads.
"""
@ -234,6 +236,56 @@ class BaseEstimator:
X = self._preprocess(X)
return self._model.predict_proba(X)
def score(self, X_val: DataFrame, y_val: Series, **kwargs):
"""Report the evaluation score of a trained estimator.
Args:
X_val: A pandas dataframe of the validation input data.
y_val: A pandas series of the validation label.
kwargs: keyword arguments of the evaluation function, for example:
- metric: A string of the metric name or a function,
e.g., 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo',
'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mae', 'mse', 'r2',
'mape'.
If metric is given, the score reports the user-specified metric.
If metric is not given, it defaults to accuracy for classification and r2
for regression.
You can also pass a customized metric function; for examples of how to pass one,
see
[test/nlp/test_autohf_custom_metric.py](https://github.com/microsoft/FLAML/blob/main/test/nlp/test_autohf_custom_metric.py) and
[test/automl/test_multiclass.py](https://github.com/microsoft/FLAML/blob/main/test/automl/test_multiclass.py).
Returns:
The evaluation score on the validation dataset.
"""
from .ml import metric_loss_score
from .ml import is_min_metric
if self._model is not None:
if self._task == "rank":
raise NotImplementedError(
"AutoML.score() is not implemented for ranking"
)
else:
X_val = self._preprocess(X_val)
metric = kwargs.get("metric", None)
if metric:
y_pred = self.predict(X_val, **kwargs)
if is_min_metric(metric):
return metric_loss_score(metric, y_pred, y_val)
else:
return 1.0 - metric_loss_score(metric, y_pred, y_val)
else:
return self._model.score(X_val, y_val, **kwargs)
else:
logger.warning(
"Estimator is not fit yet. Please run fit() before predict()."
)
return 0.0
def cleanup(self):
del self._model
self._model = None
@ -244,7 +296,7 @@ class BaseEstimator:
Args:
data_size: A tuple of two integers, number of rows and columns.
task: A str of the task type, e.g., "binary", "multi", "regression".
task: A str of the task type, e.g., "binary", "multiclass", "regression".
Returns:
A dictionary of the search space.
@ -518,7 +570,6 @@ class TransformersEstimator(BaseEstimator):
else self.hf_args.model_path,
self._task,
)
self._metric = kwargs["metric"]
try:
@ -720,15 +771,11 @@ class TransformersEstimator(BaseEstimator):
metric_dict["automl_metric"] = loss
return metric_dict
def _init_model_for_predict(self, X_test):
from datasets import Dataset
def _init_model_for_predict(self):
from .nlp.huggingface.trainer import TrainerForAuto
from .nlp.huggingface.data_collator import DataCollatorForPredict
from .nlp.utils import load_model
X_test, _ = self._preprocess(X_test, **self._kwargs)
test_dataset = Dataset.from_pandas(X_test)
this_model = load_model(
checkpoint_path=self._checkpoint_path,
task=self._task,
@ -750,25 +797,56 @@ class TransformersEstimator(BaseEstimator):
)
if self._task in NLG_TASKS:
setattr(new_trainer, "_is_seq2seq", True)
return new_trainer, test_dataset, training_args
return new_trainer, training_args
def predict_proba(self, X, **kwargs):
from datasets import Dataset
self._update_hf_args(kwargs)
assert (
self._task in CLASSIFICATION
), "predict_proba() only for classification tasks."
new_trainer, test_dataset, _ = self._init_model_for_predict(X)
X_test, _ = self._preprocess(X, **self._kwargs)
test_dataset = Dataset.from_pandas(X_test)
new_trainer, _ = self._init_model_for_predict()
predictions = new_trainer.predict(test_dataset)
return predictions.predictions
def score(self, X_val: DataFrame, y_val: Series, **kwargs):
import transformers
from datasets import Dataset
transformers.logging.set_verbosity_error()
self._metric = kwargs["metric"]
if (self._task not in NLG_TASKS) and (self._task != TOKENCLASSIFICATION):
self._X_val, _ = self._preprocess(X=X_val)
self._y_val = y_val
else:
self._X_val, self._y_val = self._preprocess(X=X_val, y=y_val)
eval_dataset = Dataset.from_pandas(
TransformersEstimator._join(self._X_val, self._y_val)
)
new_trainer, training_args = self._init_model_for_predict()
return new_trainer.evaluate(eval_dataset)
def predict(self, X, **kwargs):
import transformers
from datasets import Dataset
transformers.logging.set_verbosity_error()
self._update_hf_args(kwargs)
new_trainer, test_dataset, training_args = self._init_model_for_predict(X)
X_test, _ = self._preprocess(X, **self._kwargs)
test_dataset = Dataset.from_pandas(X_test)
new_trainer, training_args = self._init_model_for_predict()
if self._task not in NLG_TASKS:
predictions = new_trainer.predict(test_dataset)
@ -1677,6 +1755,17 @@ class Prophet(SKLearnEstimator):
)
return np.ones(X.shape[0])
def score(self, X_val: DataFrame, y_val: Series, **kwargs):
from sklearn.metrics import r2_score
from .ml import metric_loss_score
y_pred = self.predict(X_val)
self._metric = kwargs.get("metric", None)
if self._metric:
return metric_loss_score(self._metric, y_pred, y_val)
else:
return r2_score(y_val, y_pred)  # sklearn expects (y_true, y_pred)
class ARIMA(Prophet):
"""The class for tuning ARIMA."""

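The largest refactor in this file moves data preparation out of `_init_model_for_predict`: the helper now only reloads the checkpoint and rebuilds the trainer, while `predict`, `predict_proba`, and the new `score` each preprocess their own input. A sketch of the resulting call pattern; `estimator` stands for a fitted `TransformersEstimator`, and the private methods are the ones shown in the hunks above:

```python
from datasets import Dataset  # Hugging Face `datasets`, as used in the diff


def predict_pattern(estimator, X):
    # Each caller now owns its preprocessing and Dataset construction...
    X_test, _ = estimator._preprocess(X, **estimator._kwargs)
    test_dataset = Dataset.from_pandas(X_test)
    # ...while _init_model_for_predict only returns the trainer and args.
    trainer, _training_args = estimator._init_model_for_predict()
    return trainer.predict(test_dataset)
```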
View File

@ -128,9 +128,9 @@
"Requirement already satisfied: parso<0.9.0,>=0.8.0 in /usr/local/lib/python3.9/site-packages (from jedi>=0.16->ipython>=7.23.1->ipykernel->jupyter->flaml[notebook]) (0.8.2)\n",
"Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.9/site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets->jupyter->flaml[notebook]) (21.2.0)\n",
"Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/local/lib/python3.9/site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets->jupyter->flaml[notebook]) (0.18.0)\n",
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\n",
"\u001b[33mWARNING: You are using pip version 21.3; however, version 21.3.1 is available.\n",
"You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n"
"\u001B[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001B[0m\n",
"\u001B[33mWARNING: You are using pip version 21.3; however, version 21.3.1 is available.\n",
"You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.\u001B[0m\n"
]
}
],
@ -863,7 +863,7 @@
" \n",
" Args:\n",
" task: A string of the task type, one of\n",
" 'binary', 'multi', 'regression'\n",
" 'binary', 'multiclass', 'regression'\n",
" config: A dictionary containing the hyperparameter names\n",
" and 'n_jobs' as keys. n_jobs is the number of parallel threads.\n",
" '''\n",
@ -1283,4 +1283,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View File

@ -203,7 +203,7 @@ class TestMultiClass(unittest.TestCase):
print(automl_experiment.best_estimator)
automl_experiment = AutoML()
estimator = automl_experiment.get_estimator_from_log(
automl_settings["log_file_name"], record_id=0, task="multi"
automl_settings["log_file_name"], record_id=0, task="multiclass"
)
print(estimator)
(

test/automl/test_score.py (new file, 218 lines)
View File

@ -0,0 +1,218 @@
from flaml import AutoML
import pandas as pd
from sklearn.datasets import fetch_california_housing, fetch_openml
class TestScore:
def test_forecast(self, budget=5):
import pickle
# using dataframe
import statsmodels.api as sm
data = sm.datasets.co2.load_pandas().data["co2"].resample("MS").mean()
data = (
data.fillna(data.bfill())
.to_frame()
.reset_index()
.rename(columns={"index": "ds", "co2": "y"})
)
num_samples = data.shape[0]
time_horizon = 12
split_idx = num_samples - time_horizon
X_test = data[split_idx:]["ds"]
y_test = data[split_idx:]["y"]
df = data[:split_idx]
automl = AutoML()
settings = {
"time_budget": budget, # total running time in seconds
"metric": "mape", # primary metric
"task": "ts_forecast", # task type
"log_file_name": "test/CO2_forecast.log", # flaml log file
"eval_method": "holdout",
"label": "y",
}
"""The main flaml automl API"""
try:
import prophet
automl.fit(
dataframe=df,
estimator_list=["prophet", "arima", "sarimax"],
**settings,
period=time_horizon,
)
automl.score(X_test, y_test)
automl.pickle("automl.pkl")
with open("automl.pkl", "rb") as f:
pickle.load(f)
except ImportError:
print("not using prophet due to ImportError")
automl.fit(
dataframe=df,
**settings,
estimator_list=["arima", "sarimax"],
period=time_horizon,
)
automl.score(X_test, y_test)
automl.pickle("automl.pkl")
with open("automl.pkl", "rb") as f:
pickle.load(f)
def test_classification(self):
X = pd.DataFrame(
{
"f1": [1, -2, 3, -4, 5, -6, -7, 8, -9, -10, -11, -12, -13, -14],
"f2": [
3.0,
16.0,
10.0,
12.0,
3.0,
14.0,
11.0,
12.0,
5.0,
14.0,
20.0,
16.0,
15.0,
11.0,
],
"f3": [
"a",
"b",
"a",
"c",
"c",
"b",
"b",
"b",
"b",
"a",
"b",
1.0,
1.0,
"a",
],
"f4": [
True,
True,
False,
True,
True,
False,
False,
False,
True,
True,
False,
False,
True,
True,
],
}
)
y = pd.Series([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1])
automl = AutoML()
for each_estimator in [
"catboost",
"lrl2",
"lrl1",
"rf",
"lgbm",
"extra_tree",
"kneighbor",
"xgboost",
]:
automl_settings = {
"time_budget": 6,
"task": "classification",
"n_jobs": 1,
"estimator_list": [each_estimator],
"metric": "accuracy",
"log_training_metric": True,
}
automl.score(X, y)  # covers the case where no estimator has been trained yet
automl.fit(X, y, **automl_settings)
automl.score(X, y)
automl.score(X, y, **{"metric": "accuracy"})
automl.pickle("automl.pkl")
def test_regression(self):
automl_experiment = AutoML()
X_train, y_train = fetch_california_housing(return_X_y=True)
n = int(len(y_train) * 9 // 10)
for each_estimator in [
"lgbm",
"xgboost",
"rf",
"extra_tree",
"catboost",
"kneighbor",
]:
automl_settings = {
"time_budget": 2,
"task": "regression",
"log_file_name": "test/california.log",
"log_training_metric": True,
"estimator_list": [each_estimator],
"n_jobs": 1,
"model_history": True,
}
automl_experiment.fit(
X_train=X_train[:n],
y_train=y_train[:n],
X_val=X_train[n:],
y_val=y_train[n:],
**automl_settings,
)
automl_experiment.score(X_train[n:], y_train[n:], **{"metric": "mse"})
automl_experiment.pickle("automl.pkl")
def test_rank(self):
from sklearn.externals._arff import ArffException
dataset = "credit-g"
try:
X, y = fetch_openml(name=dataset, return_X_y=True)
y = y.cat.codes
except (ArffException, ValueError):
from sklearn.datasets import load_wine
X, y = load_wine(return_X_y=True)
import numpy as np
automl = AutoML()
n = 500
for each_estimator in ["lgbm", "xgboost"]:
automl_settings = {
"time_budget": 2,
"task": "rank",
"log_file_name": "test/{}.log".format(dataset),
"model_history": True,
"groups": np.array([0] * 200 + [1] * 200 + [2] * 100), # group labels
"learner_selector": "roundrobin",
"estimator_list": [each_estimator],
}
automl.fit(X[:n], y[:n], **automl_settings)
try:
automl.score(X[n:], y[n:])
automl.pickle("automl.pkl")
except NotImplementedError:
pass
if __name__ == "__main__":
test = TestScore()
test.test_forecast()

View File

@ -102,6 +102,8 @@ def test_hf_data():
y_val=y_val,
**automl_settings
)
automl.score(X_val, y_val, **{"metric": "accuracy"})
automl.pickle("automl.pkl")
except requests.exceptions.HTTPError:
return
@ -113,10 +115,6 @@ def test_hf_data():
record_id=0,
**automl_settings
)
with open("automl.pkl", "wb") as f:
pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
with open("automl.pkl", "rb") as f:
automl = pickle.load(f)
automl.predict(X_test)
automl.predict(["test test", "test test"])
automl.predict(
@ -183,8 +181,6 @@ def _test_custom_data():
]
)
import pickle
automl.pickle("automl.pkl")
with open("automl.pkl", "rb") as f:

View File

@ -19,7 +19,7 @@ def custom_metric(
from flaml.model import TransformersEstimator
if estimator._trainer is None:
trainer, _, _ = estimator._init_model_for_predict(X_test)
trainer, _ = estimator._init_model_for_predict()
estimator._trainer = None
else:
trainer = estimator._trainer
@ -93,6 +93,14 @@ def test_custom_metric():
# testing when max_iter=1 and do retrain only without hpo
try:
import ray
if not ray.is_initialized():
ray.init()
except ImportError:
return
automl_settings = {
"gpu_per_trial": 0,
"max_iter": 1,
@ -100,6 +108,7 @@ def test_custom_metric():
"task": "seq-classification",
"metric": custom_metric,
"log_file_name": "seqclass.log",
"use_ray": {"local_dir": "data/outut/"},
}
automl_settings["hf_args"] = {
@ -126,6 +135,8 @@ def test_custom_metric():
automl.fit(
X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings
)
automl.score(X_val, y_val, **{"metric": custom_metric})
automl.pickle("automl.pkl")
del automl
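The final hunks exercise `automl.score` with a customized metric. For reference, a minimal custom metric sketch: the first four parameters match the signature quoted in the `fit` docstring earlier in this diff, while the trailing `*args, **kwargs` and the return convention of (loss to minimize, dict of metrics to log) are assumptions based on FLAML's custom-metric interface, not shown here:

```python
from sklearn.metrics import log_loss


def custom_metric(X_test, y_test, estimator, labels, *args, **kwargs):
    # Return a value for FLAML to minimize plus a dict of metrics to log
    # (assumed convention; the body is purely illustrative).
    y_pred = estimator.predict_proba(X_test)
    test_loss = log_loss(y_test, y_pred, labels=labels)
    return test_loss, {"test_loss": test_loss}
```

Such a function can be passed to `fit(metric=custom_metric, ...)` and, after this commit, to `score(X_val, y_val, metric=custom_metric)` as the test above does.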