adding evaluation (#495)

* adding automl.score

* fixing the metric name in train_with_config

* adding pickle after score

* fixing a bug in automl.pickle
Xueqing Liu 2022-03-25 17:00:08 -04:00 committed by GitHub
parent 1d029436e7
commit 5f97532986
11 changed files with 375 additions and 39 deletions
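The headline change is a model-level evaluation entry point, `AutoML.score`, plus a more robust `AutoML.pickle`. A minimal usage sketch mirroring the tests added below (the dataset, split, and time budget are illustrative, not from the commit):

```python
import pickle

from flaml import AutoML
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True, as_frame=True)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

automl = AutoML()
automl.fit(X_train, y_train, task="classification", time_budget=10)

# New in this commit: score the best trained estimator on held-out data.
# Without a metric kwarg, classification falls back to accuracy and
# regression to r2; a metric name can also be passed explicitly.
print(automl.score(X_val, y_val))
print(automl.score(X_val, y_val, metric="accuracy"))

# Also touched here: pickling the AutoML object after scoring.
automl.pickle("automl.pkl")
with open("automl.pkl", "rb") as f:
    automl = pickle.load(f)
```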

.gitignore vendored (4 changes)
View File

@ -156,3 +156,7 @@ automl.pkl
.idea/*
.DS_Store
test/nlp/testtmp.py
test/nlp/testtmpfl.py

View File

@ -246,11 +246,6 @@ class AutoMLState:
* sample_size
/ state.data_size[0]
)
# raise Exception("bbbbb", state.time_budget, budget)
if _is_nlp_task(state.task):
state.fit_kwargs["X_val"] = state.X_val
state.fit_kwargs["y_val"] = state.y_val
(
trained_estimator,
@ -344,7 +339,7 @@ class AutoMLState:
estimator_class=self.learner_classes.get(estimator),
budget=budget,
fit_kwargs=self.fit_kwargs,
eval_metric="train_time",
eval_metric=self.metric if hasattr(self, "metric") else "train_time",
)
if sampled_weight is not None:
@ -699,6 +694,16 @@ class AutoML(BaseEstimator):
"""Time taken to find best model in seconds."""
return self.__dict__.get("_time_taken_best_iter")
def score(self, X: pd.DataFrame, y: pd.Series, **kwargs):
estimator = getattr(self, "_trained_estimator", None)
if estimator is None:
logger.warning(
"No estimator is trained. Please run fit with enough budget."
)
return None
X = self._preprocess(X)
return estimator.score(X, y, **kwargs)
def predict(
self,
X: Union[np.array, pd.DataFrame, List[str], List[List[str]]],
@ -1259,7 +1264,7 @@ class AutoML(BaseEstimator):
record_id: An integer of the record ID in the file,
0 corresponds to the first trial.
task: A string of the task type,
'binary', 'multi', 'regression', 'ts_forecast', 'rank'.
'binary', 'multiclass', 'regression', 'ts_forecast', 'rank'.
Returns:
An estimator object for the given configuration.
@ -1645,8 +1650,12 @@ class AutoML(BaseEstimator):
estimator_to_training_function = {}
for estimator in self.estimator_list:
search_state = self._search_states[estimator]
estimator_to_training_function[estimator] = search_state.training_function
del search_state.training_function
if hasattr(search_state, "training_function"):
estimator_to_training_function[
estimator
] = search_state.training_function
del search_state.training_function
with open(output_file_name, "wb") as f:
pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
@ -1781,7 +1790,7 @@ class AutoML(BaseEstimator):
'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mae', 'mse', 'r2',
'mape'. Default is 'auto'.
If passing a customized metric function, the function needs to
have the follwing signature:
have the following signature:
```python
def custom_metric(
X_test, y_test, estimator, labels,
@ -2114,7 +2123,7 @@ class AutoML(BaseEstimator):
metric = load_default_huggingface_metric_for_task(self._state.task)
elif "binary" in self._state.task:
metric = "roc_auc"
elif "multi" in self._state.task:
elif "multiclass" in self._state.task:
metric = "log_loss"
elif self._state.task in TS_FORECAST:
metric = "mape"
@ -2838,7 +2847,7 @@ class AutoML(BaseEstimator):
estimators = []
if self._ensemble and self._state.task in (
"binary",
"multi",
"multiclass",
"regression",
):
search_states = list(

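The `hasattr` guard added to `pickle()` above means a search state without a `training_function` attribute (never set, or already stripped by an earlier call) no longer raises `AttributeError` while pickling. A small standalone sketch of the guarded strip-then-dump pattern; `_SearchState` is a stand-in, not FLAML's actual search state:

```python
import pickle


class _SearchState:
    """Stand-in for a per-estimator search state (illustrative only)."""

    def __init__(self):
        # Lambdas cannot be pickled, which is why they must be stripped first.
        self.training_function = lambda config: config


def pickle_states(states, path):
    """Strip unpicklable callables, then dump; safe to call more than once."""
    for state in states.values():
        # The hasattr guard is the fix: states that were already stripped
        # (or never populated) no longer raise AttributeError on deletion.
        if hasattr(state, "training_function"):
            del state.training_function
    with open(path, "wb") as f:
        pickle.dump(states, f, pickle.HIGHEST_PROTOCOL)


states = {"lgbm": _SearchState(), "xgboost": _SearchState()}
pickle_states(states, "states.pkl")
pickle_states(states, "states.pkl")  # second call is now a no-op strip
```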
View File

@ -18,7 +18,7 @@ MULTICHOICECLASSIFICATION = "multichoice-classification"
TOKENCLASSIFICATION = "token-classification"
CLASSIFICATION = (
"binary",
"multi",
"multiclass",
"classification",
SEQCLASSIFICATION,
MULTICHOICECLASSIFICATION,

View File

@ -25,7 +25,7 @@ def load_config_predictor(estimator_name, task, location=None):
predictor = CONFIG_PREDICTORS.get(key)
if predictor:
return predictor
task = "multiclass" if task == "multi" else task
task = "multiclass" if task == "multi" else task # TODO: multi -> multiclass?
try:
location = location or LOCATION
with open(f"{location}/{estimator_name}/{task}.json", "r") as f:

View File

@ -219,6 +219,13 @@ def is_in_sklearn_metric_name_set(metric_name):
return metric_name.startswith("ndcg") or metric_name in sklearn_metric_name_set
def is_min_metric(metric_name):
return (
metric_name in ["rmse", "mae", "mse", "log_loss", "mape"]
or huggingface_metric_to_mode.get(metric_name, None) == "min"
)
def sklearn_metric_loss_score(
metric_name,
y_predict,
@ -565,6 +572,8 @@ def compute_estimator(
if isinstance(estimator, TransformersEstimator):
fit_kwargs["metric"] = eval_metric
fit_kwargs["X_val"] = X_val
fit_kwargs["y_val"] = y_val
if "holdout" == eval_method:
val_loss, metric_for_logging, train_time, pred_time = get_val_loss(
@ -633,7 +642,7 @@ def get_classification_objective(num_labels: int) -> str:
if num_labels == 2:
objective_name = "binary"
else:
objective_name = "multi"
objective_name = "multiclass"
return objective_name

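The new `is_min_metric` helper lets the estimator-level `score` (in the next file) decide whether to report the internal loss as-is or its complement. A simplified, self-contained sketch of that convention; the real helper also consults `huggingface_metric_to_mode`, and FLAML's internal loss for a maximization metric such as accuracy is 1 - metric:

```python
def is_min_metric(metric_name):
    # Loss-style metrics: smaller is better, report the loss directly.
    return metric_name in ["rmse", "mae", "mse", "log_loss", "mape"]


def report_score(metric_name, loss):
    # Maximization metrics (accuracy, f1, ...) are tracked internally as
    # 1 - metric, so the user-facing score is recovered as 1 - loss.
    return loss if is_min_metric(metric_name) else 1.0 - loss


print(report_score("mse", 0.12))       # 0.12, lower is better
print(report_score("accuracy", 0.05))  # 0.95, higher is better
```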
View File

@ -88,7 +88,9 @@ class BaseEstimator:
Args:
task: A string of the task type, one of
'binary', 'multi', 'regression', 'rank', 'forecast'.
'binary', 'multiclass', 'regression', 'rank', 'seq-classification',
'seq-regression', 'token-classification', 'multichoice-classification',
'summarization', 'ts_forecast', 'ts_forecast_classification'.
config: A dictionary containing the hyperparameter names, 'n_jobs' as keys.
n_jobs is the number of parallel threads.
"""
@ -234,6 +236,56 @@ class BaseEstimator:
X = self._preprocess(X)
return self._model.predict_proba(X)
def score(self, X_val: DataFrame, y_val: Series, **kwargs):
"""Report the evaluation score of a trained estimator.
Args:
X_val: A pandas dataframe of the validation input data.
y_val: A pandas series of the validation label.
kwargs: keyword arguments of the evaluation function, for example:
- metric: A string of the metric name or a function,
e.g., 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo',
'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mae', 'mse', 'r2',
'mape'.
If metric is given, the score reports the user-specified metric.
If metric is not given, it defaults to accuracy for classification and r2
for regression.
You can also pass a customized metric function; for examples of how to pass one,
see
[test/nlp/test_autohf_custom_metric.py](https://github.com/microsoft/FLAML/blob/main/test/nlp/test_autohf_custom_metric.py) and
[test/automl/test_multiclass.py](https://github.com/microsoft/FLAML/blob/main/test/automl/test_multiclass.py).
Returns:
The evaluation score on the validation dataset.
"""
from .ml import metric_loss_score
from .ml import is_min_metric
if self._model is not None:
if self._task == "rank":
raise NotImplementedError(
"AutoML.score() is not implemented for ranking"
)
else:
X_val = self._preprocess(X_val)
metric = kwargs.get("metric", None)
if metric:
y_pred = self.predict(X_val, **kwargs)
if is_min_metric(metric):
return metric_loss_score(metric, y_pred, y_val)
else:
return 1.0 - metric_loss_score(metric, y_pred, y_val)
else:
return self._model.score(X_val, y_val, **kwargs)
else:
logger.warning(
"Estimator is not fit yet. Please run fit() before predict()."
)
return 0.0
def cleanup(self):
del self._model
self._model = None
@ -244,7 +296,7 @@ class BaseEstimator:
Args:
data_size: A tuple of two integers, number of rows and columns.
task: A str of the task type, e.g., "binary", "multi", "regression".
task: A str of the task type, e.g., "binary", "multiclass", "regression".
Returns:
A dictionary of the search space.
@ -518,7 +570,6 @@ class TransformersEstimator(BaseEstimator):
else self.hf_args.model_path,
self._task,
)
self._metric = kwargs["metric"]
try:
@ -720,15 +771,11 @@ class TransformersEstimator(BaseEstimator):
metric_dict["automl_metric"] = loss
return metric_dict
def _init_model_for_predict(self, X_test):
from datasets import Dataset
def _init_model_for_predict(self):
from .nlp.huggingface.trainer import TrainerForAuto
from .nlp.huggingface.data_collator import DataCollatorForPredict
from .nlp.utils import load_model
X_test, _ = self._preprocess(X_test, **self._kwargs)
test_dataset = Dataset.from_pandas(X_test)
this_model = load_model(
checkpoint_path=self._checkpoint_path,
task=self._task,
@ -750,25 +797,56 @@ class TransformersEstimator(BaseEstimator):
)
if self._task in NLG_TASKS:
setattr(new_trainer, "_is_seq2seq", True)
return new_trainer, test_dataset, training_args
return new_trainer, training_args
def predict_proba(self, X, **kwargs):
from datasets import Dataset
self._update_hf_args(kwargs)
assert (
self._task in CLASSIFICATION
), "predict_proba() only for classification tasks."
new_trainer, test_dataset, _ = self._init_model_for_predict(X)
X_test, _ = self._preprocess(X, **self._kwargs)
test_dataset = Dataset.from_pandas(X_test)
new_trainer, _ = self._init_model_for_predict()
predictions = new_trainer.predict(test_dataset)
return predictions.predictions
def score(self, X_val: DataFrame, y_val: Series, **kwargs):
import transformers
from datasets import Dataset
transformers.logging.set_verbosity_error()
self._metric = kwargs["metric"]
if (self._task not in NLG_TASKS) and (self._task != TOKENCLASSIFICATION):
self._X_val, _ = self._preprocess(X=X_val)
self._y_val = y_val
else:
self._X_val, self._y_val = self._preprocess(X=X_val, y=y_val)
eval_dataset = Dataset.from_pandas(
TransformersEstimator._join(self._X_val, self._y_val)
)
new_trainer, training_args = self._init_model_for_predict()
return new_trainer.evaluate(eval_dataset)
def predict(self, X, **kwargs):
import transformers
from datasets import Dataset
transformers.logging.set_verbosity_error()
self._update_hf_args(kwargs)
new_trainer, test_dataset, training_args = self._init_model_for_predict(X)
X_test, _ = self._preprocess(X, **self._kwargs)
test_dataset = Dataset.from_pandas(X_test)
new_trainer, training_args = self._init_model_for_predict()
if self._task not in NLG_TASKS:
predictions = new_trainer.predict(test_dataset)
@ -1677,6 +1755,17 @@ class Prophet(SKLearnEstimator):
)
return np.ones(X.shape[0])
def score(self, X_val: DataFrame, y_val: Series, **kwargs):
from sklearn.metrics import r2_score
from .ml import metric_loss_score
y_pred = self.predict(X_val)
self._metric = kwargs.get("metric", None)
if self._metric:
return metric_loss_score(self._metric, y_pred, y_val)
else:
return r2_score(y_val, y_pred)  # sklearn expects (y_true, y_pred)
class ARIMA(Prophet):
"""The class for tuning ARIMA."""

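The largest refactor in this file moves data preparation out of `_init_model_for_predict`: the helper now only reloads the checkpoint and rebuilds the trainer, while `predict`, `predict_proba`, and the new `score` each preprocess their own input. A sketch of the resulting call pattern; `estimator` stands for a fitted `TransformersEstimator`, and the private methods are the ones shown in the hunks above:

```python
from datasets import Dataset  # Hugging Face `datasets`, as used in the diff


def predict_pattern(estimator, X):
    # Each caller now owns its preprocessing and Dataset construction...
    X_test, _ = estimator._preprocess(X, **estimator._kwargs)
    test_dataset = Dataset.from_pandas(X_test)
    # ...while _init_model_for_predict only returns the trainer and args.
    trainer, _training_args = estimator._init_model_for_predict()
    return trainer.predict(test_dataset)
```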
View File

@ -128,9 +128,9 @@
"Requirement already satisfied: parso<0.9.0,>=0.8.0 in /usr/local/lib/python3.9/site-packages (from jedi>=0.16->ipython>=7.23.1->ipykernel->jupyter->flaml[notebook]) (0.8.2)\n",
"Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.9/site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets->jupyter->flaml[notebook]) (21.2.0)\n",
"Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/local/lib/python3.9/site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets->jupyter->flaml[notebook]) (0.18.0)\n",
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\n",
"\u001b[33mWARNING: You are using pip version 21.3; however, version 21.3.1 is available.\n",
"You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n"
"\u001B[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001B[0m\n",
"\u001B[33mWARNING: You are using pip version 21.3; however, version 21.3.1 is available.\n",
"You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.\u001B[0m\n"
]
}
],
@ -863,7 +863,7 @@
" \n",
" Args:\n",
" task: A string of the task type, one of\n",
" 'binary', 'multi', 'regression'\n",
" 'binary', 'multiclass', 'regression'\n",
" config: A dictionary containing the hyperparameter names\n",
" and 'n_jobs' as keys. n_jobs is the number of parallel threads.\n",
" '''\n",
@ -1283,4 +1283,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View File

@ -203,7 +203,7 @@ class TestMultiClass(unittest.TestCase):
print(automl_experiment.best_estimator)
automl_experiment = AutoML()
estimator = automl_experiment.get_estimator_from_log(
automl_settings["log_file_name"], record_id=0, task="multi"
automl_settings["log_file_name"], record_id=0, task="multiclass"
)
print(estimator)
(

test/automl/test_score.py (new file, 218 lines)
View File

@ -0,0 +1,218 @@
from flaml import AutoML
import pandas as pd
from sklearn.datasets import fetch_california_housing, fetch_openml
class TestScore:
def test_forecast(self, budget=5):
import pickle
# using dataframe
import statsmodels.api as sm
data = sm.datasets.co2.load_pandas().data["co2"].resample("MS").mean()
data = (
data.fillna(data.bfill())
.to_frame()
.reset_index()
.rename(columns={"index": "ds", "co2": "y"})
)
num_samples = data.shape[0]
time_horizon = 12
split_idx = num_samples - time_horizon
X_test = data[split_idx:]["ds"]
y_test = data[split_idx:]["y"]
df = data[:split_idx]
automl = AutoML()
settings = {
"time_budget": budget, # total running time in seconds
"metric": "mape", # primary metric
"task": "ts_forecast", # task type
"log_file_name": "test/CO2_forecast.log", # flaml log file
"eval_method": "holdout",
"label": "y",
}
"""The main flaml automl API"""
try:
import prophet
automl.fit(
dataframe=df,
estimator_list=["prophet", "arima", "sarimax"],
**settings,
period=time_horizon,
)
automl.score(X_test, y_test)
automl.pickle("automl.pkl")
with open("automl.pkl", "rb") as f:
pickle.load(f)
except ImportError:
print("not using prophet due to ImportError")
automl.fit(
dataframe=df,
**settings,
estimator_list=["arima", "sarimax"],
period=time_horizon,
)
automl.score(X_test, y_test)
automl.pickle("automl.pkl")
with open("automl.pkl", "rb") as f:
pickle.load(f)
def test_classification(self):
X = pd.DataFrame(
{
"f1": [1, -2, 3, -4, 5, -6, -7, 8, -9, -10, -11, -12, -13, -14],
"f2": [
3.0,
16.0,
10.0,
12.0,
3.0,
14.0,
11.0,
12.0,
5.0,
14.0,
20.0,
16.0,
15.0,
11.0,
],
"f3": [
"a",
"b",
"a",
"c",
"c",
"b",
"b",
"b",
"b",
"a",
"b",
1.0,
1.0,
"a",
],
"f4": [
True,
True,
False,
True,
True,
False,
False,
False,
True,
True,
False,
False,
True,
True,
],
}
)
y = pd.Series([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1])
automl = AutoML()
for each_estimator in [
"catboost",
"lrl2",
"lrl1",
"rf",
"lgbm",
"extra_tree",
"kneighbor",
"xgboost",
]:
automl_settings = {
"time_budget": 6,
"task": "classification",
"n_jobs": 1,
"estimator_list": [each_estimator],
"metric": "accuracy",
"log_training_metric": True,
}
automl.score(X, y)  # covers the case where no estimator has been trained yet
automl.fit(X, y, **automl_settings)
automl.score(X, y)
automl.score(X, y, **{"metric": "accuracy"})
automl.pickle("automl.pkl")
def test_regression(self):
automl_experiment = AutoML()
X_train, y_train = fetch_california_housing(return_X_y=True)
n = int(len(y_train) * 9 // 10)
for each_estimator in [
"lgbm",
"xgboost",
"rf",
"extra_tree",
"catboost",
"kneighbor",
]:
automl_settings = {
"time_budget": 2,
"task": "regression",
"log_file_name": "test/california.log",
"log_training_metric": True,
"estimator_list": [each_estimator],
"n_jobs": 1,
"model_history": True,
}
automl_experiment.fit(
X_train=X_train[:n],
y_train=y_train[:n],
X_val=X_train[n:],
y_val=y_train[n:],
**automl_settings,
)
automl_experiment.score(X_train[n:], y_train[n:], **{"metric": "mse"})
automl_experiment.pickle("automl.pkl")
def test_rank(self):
from sklearn.externals._arff import ArffException
dataset = "credit-g"
try:
X, y = fetch_openml(name=dataset, return_X_y=True)
y = y.cat.codes
except (ArffException, ValueError):
from sklearn.datasets import load_wine
X, y = load_wine(return_X_y=True)
import numpy as np
automl = AutoML()
n = 500
for each_estimator in ["lgbm", "xgboost"]:
automl_settings = {
"time_budget": 2,
"task": "rank",
"log_file_name": "test/{}.log".format(dataset),
"model_history": True,
"groups": np.array([0] * 200 + [1] * 200 + [2] * 100), # group labels
"learner_selector": "roundrobin",
"estimator_list": [each_estimator],
}
automl.fit(X[:n], y[:n], **automl_settings)
try:
automl.score(X[n:], y[n:])
automl.pickle("automl.pkl")
except NotImplementedError:
pass
if __name__ == "__main__":
test = TestScore()
test.test_forecast()

View File

@ -102,6 +102,8 @@ def test_hf_data():
y_val=y_val,
**automl_settings
)
automl.score(X_val, y_val, **{"metric": "accuracy"})
automl.pickle("automl.pkl")
except requests.exceptions.HTTPError:
return
@ -113,10 +115,6 @@ def test_hf_data():
record_id=0,
**automl_settings
)
with open("automl.pkl", "wb") as f:
pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
with open("automl.pkl", "rb") as f:
automl = pickle.load(f)
automl.predict(X_test)
automl.predict(["test test", "test test"])
automl.predict(
@ -183,8 +181,6 @@ def _test_custom_data():
]
)
import pickle
automl.pickle("automl.pkl")
with open("automl.pkl", "rb") as f:

View File

@ -19,7 +19,7 @@ def custom_metric(
from flaml.model import TransformersEstimator
if estimator._trainer is None:
trainer, _, _ = estimator._init_model_for_predict(X_test)
trainer, _ = estimator._init_model_for_predict()
estimator._trainer = None
else:
trainer = estimator._trainer
@ -93,6 +93,14 @@ def test_custom_metric():
# testing when max_iter=1 and do retrain only without hpo
try:
import ray
if not ray.is_initialized():
ray.init()
except ImportError:
return
automl_settings = {
"gpu_per_trial": 0,
"max_iter": 1,
@ -100,6 +108,7 @@ def test_custom_metric():
"task": "seq-classification",
"metric": custom_metric,
"log_file_name": "seqclass.log",
"use_ray": {"local_dir": "data/outut/"},
}
automl_settings["hf_args"] = {
@ -126,6 +135,8 @@ def test_custom_metric():
automl.fit(
X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings
)
automl.score(X_val, y_val, **{"metric": custom_metric})
automl.pickle("automl.pkl")
del automl
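The final hunks exercise `automl.score` with a customized metric. For reference, a minimal custom metric sketch: the first four parameters match the signature quoted in the `fit` docstring earlier in this diff, while the trailing `*args, **kwargs` and the return convention of (loss to minimize, dict of metrics to log) are assumptions based on FLAML's custom-metric interface, not shown here:

```python
from sklearn.metrics import log_loss


def custom_metric(X_test, y_test, estimator, labels, *args, **kwargs):
    # Return a value for FLAML to minimize plus a dict of metrics to log
    # (assumed convention; the body is purely illustrative).
    y_pred = estimator.predict_proba(X_test)
    test_loss = log_loss(y_test, y_pred, labels=labels)
    return test_loss, {"test_loss": test_loss}
```

Such a function can be passed to `fit(metric=custom_metric, ...)` and, after this commit, to `score(X_val, y_val, metric=custom_metric)` as the test above does.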