diff --git a/flaml/automl.py b/flaml/automl.py index 1ec2cdcd29..c1d2ac78ab 100644 --- a/flaml/automl.py +++ b/flaml/automl.py @@ -698,6 +698,10 @@ class AutoML(BaseEstimator): return attr.classes_.tolist() return None + @property + def n_features_in_(self): + return self._trained_estimator.n_features_in_ + @property def time_to_find_best_model(self) -> float: """Time taken to find best model in seconds.""" @@ -2160,7 +2164,6 @@ class AutoML(BaseEstimator): del self._state.y_train, self._state.y_train_all, self._state.y_val del self._sample_weight_full, self._state.fit_kwargs del self._state.groups, self._state.groups_all, self._state.groups_val - # if verbose == 0: logger.setLevel(old_level) def _search_parallel(self): @@ -2247,7 +2250,7 @@ class AutoML(BaseEstimator): trial for trial in analysis.trials if trial.last_result - and trial.last_result["wall_clock_time"] is not None + and trial.last_result.get("wall_clock_time") is not None ), key=lambda x: x.last_result["wall_clock_time"], ) @@ -2259,8 +2262,9 @@ class AutoML(BaseEstimator): estimator = config.get("ml", config)["learner"] search_state = self._search_states[estimator] search_state.update(result, 0) - if result["wall_clock_time"] is not None: - self._state.time_from_start = result["wall_clock_time"] + wall_time = result.get("wall_clock_time") + if wall_time is not None: + self._state.time_from_start = wall_time if search_state.sample_size == self._state.data_size[0]: self._iter_per_learner[estimator] += 1 if not self._fullsize_reached: @@ -2278,17 +2282,34 @@ class AutoML(BaseEstimator): self._time_taken_best_iter = self._state.time_from_start better = True self._search_states[estimator].best_config = config - if (better or self._log_type == "all") and self._training_log: - self._training_log.append( - self._iter_per_learner[estimator], - search_state.metric_for_logging, - search_state.trial_time, - self._state.time_from_start, - search_state.val_loss, - config, - estimator, - search_state.sample_size, - ) + if better or self._log_type == "all": + self._log_trial(search_state, estimator) + + def _log_trial(self, search_state, estimator): + if self._training_log: + self._training_log.append( + self._iter_per_learner[estimator], + search_state.metric_for_logging, + search_state.trial_time, + self._state.time_from_start, + search_state.val_loss, + search_state.config, + estimator, + search_state.sample_size, + ) + if mlflow is not None and mlflow.active_run(): + with mlflow.start_run(nested=True): + mlflow.log_metric("iter_counter", self._iter_per_learner[estimator]) + mlflow.log_param("metric_for_logging", search_state.metric_for_logging) + mlflow.log_metric("trial_time", search_state.trial_time) + mlflow.log_metric("wall_clock_time", self._state.time_from_start) + mlflow.log_metric("validation_loss", search_state.val_loss) + mlflow.log_param("config", search_state.config) + mlflow.log_param("learner", estimator) + mlflow.log_param("sample_size", search_state.sample_size) + mlflow.log_metric("best_validation_loss", search_state.best_loss) + mlflow.log_param("best_config", search_state.best_config) + mlflow.log_param("best_learner", self._best_estimator) def _search_sequential(self): try: @@ -2461,8 +2482,9 @@ class AutoML(BaseEstimator): f"Estimated sufficient time budget={max_budget:.0f}s." f" Estimated necessary time budget={min_budget:.0f}s." ) - if result["wall_clock_time"] is not None: - self._state.time_from_start = result["wall_clock_time"] + wall_time = result.get("wall_clock_time") + if wall_time is not None: + self._state.time_from_start = wall_time # logger.info(f"{self._search_states[estimator].sample_size}, {data_size}") if search_state.sample_size == self._state.data_size[0]: self._iter_per_learner[estimator] += 1 @@ -2500,38 +2522,8 @@ class AutoML(BaseEstimator): ): search_state.trained_estimator.cleanup() if better or self._log_type == "all": - if self._training_log: - self._training_log.append( - self._iter_per_learner[estimator], - search_state.metric_for_logging, - search_state.trial_time, - self._state.time_from_start, - search_state.val_loss, - search_state.config, - estimator, - search_state.sample_size, - ) - if mlflow is not None and mlflow.active_run(): - with mlflow.start_run(nested=True): - mlflow.log_metric( - "iter_counter", self._iter_per_learner[estimator] - ) - mlflow.log_param( - "metric_for_logging", search_state.metric_for_logging - ) - mlflow.log_metric("trial_time", search_state.trial_time) - mlflow.log_metric( - "wall_clock_time", self._state.time_from_start - ) - mlflow.log_metric("validation_loss", search_state.val_loss) - mlflow.log_param("config", search_state.config) - mlflow.log_param("learner", estimator) - mlflow.log_param("sample_size", search_state.sample_size) - mlflow.log_metric( - "best_validation_loss", search_state.best_loss - ) - mlflow.log_param("best_config", search_state.best_config) - mlflow.log_param("best_learner", self._best_estimator) + self._log_trial(search_state, estimator) + logger.info( " at {:.1f}s,\testimator {}'s best error={:.4f},\tbest estimator {}'s best error={:.4f}".format( self._state.time_from_start, @@ -2640,6 +2632,7 @@ class AutoML(BaseEstimator): ) if self._trained_estimator: logger.info(f"selected model: {self._trained_estimator.model}") + estimators = [] if self._ensemble and self._state.task in ( "binary", "multi", @@ -2673,8 +2666,7 @@ class AutoML(BaseEstimator): if x[1].best_loss < 4 * self._selected.best_loss ] logger.info(estimators) - if len(estimators) <= 1: - return + if len(estimators) > 1: if self._state.task in CLASSIFICATION: from sklearn.ensemble import StackingClassifier as Stacker else: @@ -2732,6 +2724,7 @@ class AutoML(BaseEstimator): if ( self._state.task == TS_FORECAST or self._trained_estimator is None + or self._trained_estimator.model is None or ( self._state.time_budget - self._state.time_from_start > self._selected.est_retrain_time(self.data_size_full) @@ -2758,8 +2751,6 @@ class AutoML(BaseEstimator): logger.info(f"retrained model: {self._trained_estimator.model}") else: logger.info("not retraining because the time budget is too small.") - if self.model and mlflow is not None and mlflow.active_run(): - mlflow.sklearn.log_model(self.model, "best_model") def __del__(self): if ( diff --git a/flaml/model.py b/flaml/model.py index 37dfbab924..42d74bc068 100644 --- a/flaml/model.py +++ b/flaml/model.py @@ -116,7 +116,7 @@ class BaseEstimator: @property def n_features_in_(self): - return self.model.n_features_in_ + return self._model.n_features_in_ @property def model(self): diff --git a/flaml/version.py b/flaml/version.py index a2fecb4576..c5981731c5 100644 --- a/flaml/version.py +++ b/flaml/version.py @@ -1 +1 @@ -__version__ = "0.9.2" +__version__ = "0.9.3" diff --git a/notebook/integrate_azureml.ipynb b/notebook/integrate_azureml.ipynb index ec72fd9cb1..f66e96de18 100644 --- a/notebook/integrate_azureml.ipynb +++ b/notebook/integrate_azureml.ipynb @@ -2,6 +2,11 @@ "cells": [ { "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, "source": [ "Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved. \n", "\n", @@ -22,105 +27,106 @@ "\n", "In this notebook, we use one real data example (binary classification) to showcase how to use FLAML library together with AzureML.\n", "\n", - "FLAML requires `Python>=3.6`. To run this notebook example, please install flaml with the `notebook` and `azureml` option:\n", + "FLAML requires `Python>=3.6`. To run this notebook example, please install flaml with the [azureml] option:\n", "```bash\n", - "pip install flaml[notebook,azureml]\n", + "pip install flaml[azureml]\n", "```" - ], - "metadata": { - "slideshow": { - "slide_type": "slide" - } - } + ] }, { "cell_type": "code", "execution_count": null, - "source": [ - "!pip install flaml[notebook,azureml]" - ], + "metadata": {}, "outputs": [], - "metadata": {} + "source": [ + "!pip install flaml[azureml]" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### Enable mlflow in AzureML workspace" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import mlflow\n", "from azureml.core import Workspace\n", "\n", "ws = Workspace.from_config()\n", "mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, "source": [ "## 2. Classification Example\n", "### Load data and preprocess\n", "\n", "Download [Airlines dataset](https://www.openml.org/d/1169) from OpenML. The task is to predict whether a given flight will be delayed, given the information of the scheduled departure." - ], - "metadata": { - "slideshow": { - "slide_type": "slide" - } - } + ] }, { "cell_type": "code", "execution_count": null, - "source": [ - "from flaml.data import load_openml_dataset\n", - "X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir='./')" - ], - "outputs": [], "metadata": { "slideshow": { "slide_type": "subslide" }, "tags": [] - } + }, + "outputs": [], + "source": [ + "from flaml.data import load_openml_dataset\n", + "X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir='./')" + ] }, { "cell_type": "markdown", - "source": [ - "### Run FLAML\n", - "In the FLAML automl run configuration, users can specify the task type, time budget, error metric, learner list, whether to subsample, resampling strategy type, and so on. All these arguments have default values which will be used if users do not provide them. For example, the default ML learners of FLAML are `['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree', 'lrl1']`. " - ], "metadata": { "slideshow": { "slide_type": "slide" } - } + }, + "source": [ + "### Run FLAML\n", + "In the FLAML automl run configuration, users can specify the task type, time budget, error metric, learner list, whether to subsample, resampling strategy type, and so on. All these arguments have default values which will be used if users do not provide them. For example, the default ML learners of FLAML are `['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree', 'lrl1']`. " + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], "source": [ "''' import AutoML class from flaml package '''\n", "from flaml import AutoML\n", "automl = AutoML()" - ], - "outputs": [], - "metadata": { - "slideshow": { - "slide_type": "slide" - } - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], "source": [ "settings = {\n", " \"time_budget\": 60, # total running time in seconds\n", @@ -131,181 +137,77 @@ " \"sample\": False, # whether to subsample training data\n", " \"log_file_name\": 'airlines_experiment.log', # flaml log file\n", "}" - ], - "outputs": [], - "metadata": { - "slideshow": { - "slide_type": "slide" - } - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, + "outputs": [], "source": [ - "mlflow.set_experiment(\"flaml\")\n", + "experiment = mlflow.set_experiment(\"flaml\")\n", "with mlflow.start_run() as run:\n", - " '''The main flaml automl API'''\n", - " automl.fit(X_train=X_train, y_train=y_train, **settings)" - ], - "outputs": [], - "metadata": { - "slideshow": { - "slide_type": "slide" - }, - "tags": [] - } + " automl.fit(X_train=X_train, y_train=y_train, **settings)\n", + " # log the model\n", + " mlflow.sklearn.log_model(automl, \"automl\")\n" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ - "### Best model and metric" - ], - "metadata": { - "slideshow": { - "slide_type": "slide" - } - } + "### Load the model" + ] }, { "cell_type": "code", "execution_count": null, - "source": [ - "''' retrieve best config and best learner'''\n", - "print('Best ML leaner:', automl.best_estimator)\n", - "print('Best hyperparmeter config:', automl.best_config)\n", - "print('Best accuracy on validation data: {0:.4g}'.format(1 - automl.best_loss))\n", - "print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))" - ], + "metadata": {}, "outputs": [], - "metadata": { - "slideshow": { - "slide_type": "slide" - }, - "tags": [] - } - }, - { - "cell_type": "code", - "execution_count": null, "source": [ - "automl.model" - ], - "outputs": [], - "metadata": { - "slideshow": { - "slide_type": "slide" - } - } - }, - { - "cell_type": "code", - "execution_count": null, - "source": [ - "''' pickle and save the automl object '''\n", - "import pickle\n", - "with open('automl.pkl', 'wb') as f:\n", - " pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)" - ], - "outputs": [], - "metadata": { - "slideshow": { - "slide_type": "slide" - } - } - }, - { - "cell_type": "code", - "execution_count": null, - "source": [ - "''' compute predictions of testing dataset ''' \n", - "y_pred = automl.predict(X_test)\n", - "print('Predicted labels', y_pred)\n", - "print('True labels', y_test)\n", - "y_pred_proba = automl.predict_proba(X_test)[:,1]" - ], - "outputs": [], - "metadata": { - "slideshow": { - "slide_type": "slide" - }, - "tags": [] - } - }, - { - "cell_type": "code", - "execution_count": null, - "source": [ - "''' compute different metric values on testing dataset'''\n", - "from flaml.ml import sklearn_metric_loss_score\n", - "print('accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred, y_test))\n", - "print('roc_auc', '=', 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test))\n", - "print('log_loss', '=', sklearn_metric_loss_score('log_loss', y_pred_proba, y_test))" - ], - "outputs": [], - "metadata": { - "slideshow": { - "slide_type": "slide" - }, - "tags": [] - } + "automl = mlflow.sklearn.load_model(f\"{run.info.artifact_uri}/automl\")\n", + "print(automl.predict_proba(X_test))\n", + "print(automl.predict(X_test))" + ] }, { "cell_type": "markdown", - "source": [ - "### Log history" - ], "metadata": { "slideshow": { "slide_type": "slide" } - } + }, + "source": [ + "### Retrieve logs" + ] }, { "cell_type": "code", "execution_count": null, - "source": [ - "from flaml.data import get_output_from_log\n", - "time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\n", - " get_output_from_log(filename = settings['log_file_name'], time_budget = 60)\n", - "\n", - "for config in config_history:\n", - " print(config)" - ], - "outputs": [], "metadata": { "slideshow": { "slide_type": "subslide" }, "tags": [] - } - }, - { - "cell_type": "code", - "execution_count": null, - "source": [ - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "\n", - "plt.title('Learning Curve')\n", - "plt.xlabel('Wall Clock Time (s)')\n", - "plt.ylabel('Validation Accuracy')\n", - "plt.scatter(time_history, 1 - np.array(valid_loss_history))\n", - "plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post')\n", - "plt.show()" - ], + }, "outputs": [], - "metadata": { - "slideshow": { - "slide_type": "slide" - } - } + "source": [ + "mlflow.search_runs(experiment_ids=[experiment.experiment_id], filter_string=\"params.learner = 'xgboost'\")" + ] } ], "metadata": { + "interpreter": { + "hash": "0cfea3304185a9579d09e0953576b57c8581e46e6ebc6dfeb681bc5a511f7544" + }, "kernelspec": { - "name": "python3", - "display_name": "Python 3.8.0 64-bit ('blend': conda)" + "display_name": "Python 3.8.0 64-bit ('blend': conda)", + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -317,12 +219,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.0" - }, - "interpreter": { - "hash": "0cfea3304185a9579d09e0953576b57c8581e46e6ebc6dfeb681bc5a511f7544" + "version": "3.9.2" } }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/test/automl/test_classification.py b/test/automl/test_classification.py index 66d8cdfd2a..1458fad2d7 100644 --- a/test/automl/test_classification.py +++ b/test/automl/test_classification.py @@ -99,6 +99,7 @@ class TestClassification(unittest.TestCase): "ensemble": True, } automl.fit(X, y, **automl_settings) + assert automl.model is not None automl = AutoML() automl_settings = { diff --git a/test/automl/test_notebook_example.py b/test/automl/test_notebook_example.py index 33c9c15ce7..79b6529f98 100644 --- a/test/automl/test_notebook_example.py +++ b/test/automl/test_notebook_example.py @@ -101,13 +101,25 @@ def test_mlflow(): "log_file_name": "adult.log", # flaml log file } mlflow.set_experiment("flaml") - with mlflow.start_run(): - """The main flaml automl API""" + with mlflow.start_run() as run: automl.fit(X_train=X_train, y_train=y_train, **settings) - # subprocess.check_call([sys.executable, "-m", "pip", "uninstall", "mlflow"]) + mlflow.sklearn.log_model(automl, "automl") + loaded_model = mlflow.pyfunc.load_model(f"{run.info.artifact_uri}/automl") + print(loaded_model.predict(X_test)) automl._mem_thres = 0 print(automl.trainable(automl.points_to_evaluate[0])) + settings["use_ray"] = True + try: + with mlflow.start_run() as run: + automl.fit(X_train=X_train, y_train=y_train, **settings) + mlflow.sklearn.log_model(automl, "automl") + automl = mlflow.sklearn.load_model(f"{run.info.artifact_uri}/automl") + print(automl.predict_proba(X_test)) + except ImportError: + pass + # subprocess.check_call([sys.executable, "-m", "pip", "uninstall", "mlflow"]) + if __name__ == "__main__": test_automl(120) diff --git a/website/docs/Examples/Integrate - AzureML.md b/website/docs/Examples/Integrate - AzureML.md index 20cc9fed9d..3aa6551be8 100644 --- a/website/docs/Examples/Integrate - AzureML.md +++ b/website/docs/Examples/Integrate - AzureML.md @@ -28,12 +28,11 @@ mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri()) ```python from flaml.data import load_openml_dataset +from flaml import AutoML # Download [Airlines dataset](https://www.openml.org/d/1169) from OpenML. The task is to predict whether a given flight will be delayed, given the information of the scheduled departure. X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir="./") -from flaml import AutoML - automl = AutoML() settings = { "time_budget": 60, # total running time in seconds @@ -41,12 +40,23 @@ settings = { "task": "classification", # task type "log_file_name": "airlines_experiment.log", # flaml log file } -mlflow.set_experiment("flaml") # the experiment name in AzureML workspace +experiment = mlflow.set_experiment("flaml") # the experiment name in AzureML workspace with mlflow.start_run() as run: # create a mlflow run automl.fit(X_train=X_train, y_train=y_train, **settings) + mlflow.sklearn.log_model(automl, "automl") ``` -The metrics in the run will be automatically logged in an experiment named "flaml" in your AzureML workspace. +The metrics in the run will be automatically logged in an experiment named "flaml" in your AzureML workspace. They can be retrieved by `mlflow.search_runs`: + +```python +mlflow.search_runs(experiment_ids=[experiment.experiment_id], filter_string="params.learner = 'xgboost'") +``` + +The logged model can be loaded and used to make predictions: +```python +automl = mlflow.sklearn.load_model(f"{run.info.artifact_uri}/automl") +print(automl.predict(X_test)) +``` [Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_azureml.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_azureml.ipynb)