Improve annotations in automl and ml modules (#919)

* begin annotation in automl.py and ml.py

* EstimatorSubclass + annotate metric

* review: fixes + setting fit_kwargs as proper Optional

* import from flaml.automl.model (import from flaml.model is deprecated)

* comment n_jobs in train_estimator as well

* better annotation in _compute_with_config_base

Co-authored-by: Qingyun Wu <qingyun.wu@psu.edu>

---------

Co-authored-by: Andrea W <a.ruggerini@ammagamma.com>
Co-authored-by: Qingyun Wu <qingyun.wu@psu.edu>
Andrea Ruggerini 2023-02-22 03:49:56 +01:00 committed by GitHub
parent 6aa1d16ebc
commit 8e447562c7
2 changed files with 67 additions and 39 deletions

automl.py

@@ -2,6 +2,7 @@
# * Copyright (c) FLAML authors. All rights reserved.
# * Licensed under the MIT License. See LICENSE file in the
# * project root for license information.
from __future__ import annotations
import time
import os
import sys
@@ -306,7 +307,7 @@ class SearchState:
class AutoMLState:
def _prepare_sample_train_data(self, sample_size):
def _prepare_sample_train_data(self, sample_size: int):
sampled_weight = groups = None
if sample_size <= self.data_size[0]:
if isinstance(self.X_train, pd.DataFrame):
@@ -344,7 +345,9 @@ class AutoMLState:
return sampled_X_train, sampled_y_train, sampled_weight, groups
@staticmethod
def _compute_with_config_base(config_w_resource, state, estimator, is_report=True):
def _compute_with_config_base(
config_w_resource: dict, state: AutoMLState, estimator: str, is_report: bool = True
) -> dict:
if "FLAML_sample_size" in config_w_resource:
sample_size = int(config_w_resource["FLAML_sample_size"])
else:
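
Why the first hunk adds `from __future__ import annotations`: the new `_compute_with_config_base` signature annotates `state: AutoMLState` inside the `AutoMLState` class itself. Under PEP 563 the import turns annotations into lazily-evaluated strings, so the forward self-reference is legal; without it, the name would be looked up while the class body is still executing. A minimal sketch of the pattern (the method body is a stand-in, not FLAML's logic):

```python
from __future__ import annotations  # PEP 563: annotations are stored as strings


class AutoMLState:
    @staticmethod
    def _compute_with_config_base(state: AutoMLState) -> dict:
        # Without the __future__ import, "AutoMLState" would be evaluated
        # at definition time and raise NameError: the class name is not
        # bound until the class body finishes executing.
        return {"state": state}
```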
@@ -435,9 +438,9 @@ class AutoMLState:
def _train_with_config(
self,
estimator,
config_w_resource,
sample_size=None,
estimator: str,
config_w_resource: dict,
sample_size: Optional[int] = None,
):
if not sample_size:
sample_size = config_w_resource.get(
@@ -801,11 +804,11 @@ class AutoML(BaseEstimator):
"classifier" if settings["task"] in CLASSIFICATION else "regressor"
)
def get_params(self, deep=False):
def get_params(self, deep: bool = False) -> dict:
return self._settings.copy()
@property
def config_history(self):
def config_history(self) -> dict:
"""A dictionary of iter->(estimator, config, time),
storing the best estimator, config, and the time when the best
model is updated each time.
@@ -819,7 +822,7 @@ class AutoML(BaseEstimator):
"""
return self.__dict__.get("_trained_estimator")
def best_model_for_estimator(self, estimator_name):
def best_model_for_estimator(self, estimator_name: str):
"""Return the best model found for a particular estimator.
Args:
@@ -1587,7 +1590,7 @@ class AutoML(BaseEstimator):
"""
self._state.learner_classes[learner_name] = learner_class
def get_estimator_from_log(self, log_file_name, record_id, task):
def get_estimator_from_log(self, log_file_name: str, record_id: int, task: str):
"""Get the estimator from log file.
Args:

ml.py

@@ -5,6 +5,8 @@
import time
import numpy as np
import pandas as pd
from typing import Union, Callable, TypeVar, Optional, Tuple
from sklearn.metrics import (
mean_squared_error,
r2_score,
@@ -46,9 +48,11 @@ from flaml.automl.model import (
TransformersEstimatorModelSelection,
)
from flaml.automl.data import CLASSIFICATION, group_counts, TS_FORECAST
from flaml.automl.model import BaseEstimator
import logging
logger = logging.getLogger(__name__)
EstimatorSubclass = TypeVar("EstimatorSubclass", bound=BaseEstimator)
sklearn_metric_name_set = {
"r2",
@@ -101,7 +105,12 @@ huggingface_metric_to_mode = {
huggingface_submetric_to_metric = {"rouge1": "rouge", "rouge2": "rouge"}
def get_estimator_class(task, estimator_name):
def get_estimator_class(task: str, estimator_name: str) -> EstimatorSubclass:
"""Given a task and an estimator name, return the relevant flaml-wrapped estimator class
NOTE: See why the return type is declared by using TypeVar here on the mypy doc
https://mypy.readthedocs.io/en/stable/kinds_of_types.html#the-type-of-class-objects
"""
# when adding a new learner, need to add an elif branch
if "xgboost" == estimator_name:
estimator_class = XGBoost_TS if task in TS_FORECAST else XGBoostSklearnEstimator
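
The bound `TypeVar` added in `ml.py` (see the hunk above and the mypy link in the docstring) lets annotations preserve the concrete subclass instead of widening everything to `BaseEstimator`. A minimal, self-contained sketch of the idea, with stand-in class names rather than FLAML's real estimators:

```python
from typing import Type, TypeVar


class BaseEstimator:
    """Stand-in for flaml.automl.model.BaseEstimator."""


class LGBMLikeEstimator(BaseEstimator):
    """Hypothetical subclass used only for illustration."""


# Any use of this TypeVar must be BaseEstimator or one of its subclasses.
EstimatorSubclass = TypeVar("EstimatorSubclass", bound=BaseEstimator)


def make_estimator(cls: Type[EstimatorSubclass]) -> EstimatorSubclass:
    # Because the same TypeVar appears in both positions, mypy infers the
    # return type from the argument: passing LGBMLikeEstimator yields an
    # LGBMLikeEstimator, not a bare BaseEstimator.
    return cls()


est = make_estimator(LGBMLikeEstimator)  # inferred as LGBMLikeEstimator
```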
@@ -144,7 +153,7 @@ def get_estimator_class(task, estimator_name):
def metric_loss_score(
metric_name,
metric_name: str,
y_processed_predict,
y_processed_true,
labels=None,
@@ -223,11 +232,11 @@ def metric_loss_score(
return score
def is_in_sklearn_metric_name_set(metric_name):
def is_in_sklearn_metric_name_set(metric_name: str):
return metric_name.startswith("ndcg") or metric_name in sklearn_metric_name_set
def is_min_metric(metric_name):
def is_min_metric(metric_name: str):
return (
metric_name in ["rmse", "mae", "mse", "log_loss", "mape"]
or huggingface_metric_to_mode.get(metric_name, None) == "min"
@@ -235,7 +244,7 @@ def is_min_metric(metric_name):
def sklearn_metric_loss_score(
metric_name,
metric_name: str,
y_predict,
y_true,
labels=None,
@@ -372,7 +381,7 @@ def _eval_estimator(
y_val,
weight_val,
groups_val,
eval_metric,
eval_metric: Union[str, Callable],
obj,
labels=None,
log_training_metric=False,
@@ -424,14 +433,14 @@ def _eval_estimator(
def get_val_loss(
config,
estimator,
estimator: EstimatorSubclass,
X_train,
y_train,
X_val,
y_val,
weight_val,
groups_val,
eval_metric,
eval_metric: Union[str, Callable],
obj,
labels=None,
budget=None,
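
`eval_metric: Union[str, Callable]` in the hunk above reflects FLAML's dual interface: a built-in metric can be named by string, or a custom metric can be passed as a callable. A minimal sketch of how such a union is typically dispatched (the registry and function names here are hypothetical, not FLAML's internals):

```python
from typing import Callable, Dict, List, Union

# Hypothetical registry mapping metric names to loss functions.
_BUILTIN: Dict[str, Callable[[List[float], List[float]], float]] = {
    "mse": lambda t, p: sum((a - b) ** 2 for a, b in zip(t, p)) / len(t),
}


def loss(y_true: List[float], y_pred: List[float],
         eval_metric: Union[str, Callable]) -> float:
    if isinstance(eval_metric, str):
        return _BUILTIN[eval_metric](y_true, y_pred)  # look up by name
    return eval_metric(y_true, y_pred)  # call the user-supplied metric


print(loss([1.0, 2.0], [1.0, 3.0], "mse"))  # 0.5
print(loss([1.0, 2.0], [1.0, 3.0],
           lambda t, p: max(abs(a - b) for a, b in zip(t, p))))  # 1.0
```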
@@ -487,13 +496,13 @@ def default_cv_score_agg_func(val_loss_folds, log_metrics_folds):
def evaluate_model_CV(
config,
estimator,
config: dict,
estimator: EstimatorSubclass,
X_train_all,
y_train_all,
budget,
kf,
task,
task: str,
eval_metric,
best_val_loss,
cv_score_agg_func=None,
@@ -607,19 +616,24 @@ def compute_estimator(
groups_val,
budget,
kf,
config_dic,
task,
estimator_name,
eval_method,
eval_metric,
config_dic: dict,
task: str,
estimator_name: str,
eval_method: str,
eval_metric: Union[str, Callable],
best_val_loss=np.Inf,
n_jobs=1,
estimator_class=None,
cv_score_agg_func=None,
log_training_metric=False,
fit_kwargs={},
n_jobs: Optional[
int
] = 1, # some estimators of EstimatorSubclass don't accept n_jobs. Should be None in that case.
estimator_class: Optional[EstimatorSubclass] = None,
cv_score_agg_func: Optional[callable] = None,
log_training_metric: Optional[bool] = False,
fit_kwargs: Optional[dict] = None,
free_mem_ratio=0,
):
if not fit_kwargs:
fit_kwargs = {}
estimator_class = estimator_class or get_estimator_class(task, estimator_name)
estimator = estimator_class(
**config_dic,
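
The `fit_kwargs={}` → `fit_kwargs: Optional[dict] = None` change (paired with the `if not fit_kwargs: fit_kwargs = {}` guard above) sidesteps Python's mutable-default pitfall: a default value is evaluated once at `def` time, so a `{}` default is one shared dict mutated by every call. A minimal sketch with hypothetical function names:

```python
def fit_shared(fit_kwargs={}):
    # The default dict is created once and reused for every call.
    fit_kwargs["seen"] = True
    return fit_kwargs


def fit_fresh(fit_kwargs=None):
    if not fit_kwargs:
        fit_kwargs = {}  # a new dict on each call, as in the diff above
    fit_kwargs["seen"] = True
    return fit_kwargs


assert fit_shared() is fit_shared()    # the same object leaks across calls
assert fit_fresh() is not fit_fresh()  # independent dicts per call
```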
@@ -677,18 +691,20 @@ def compute_estimator(
def train_estimator(
config_dic,
config_dic: dict,
X_train,
y_train,
task,
estimator_name,
n_jobs=1,
estimator_class=None,
task: str,
estimator_name: str,
n_jobs: Optional[
int
] = 1, # some estimators of EstimatorSubclass don't accept n_jobs. Should be None in that case.
estimator_class: Optional[EstimatorSubclass] = None,
budget=None,
fit_kwargs={},
fit_kwargs: Optional[dict] = None,
eval_metric=None,
free_mem_ratio=0,
):
) -> Tuple[EstimatorSubclass, float]:
start_time = time.time()
estimator_class = estimator_class or get_estimator_class(task, estimator_name)
estimator = estimator_class(
@@ -696,6 +712,9 @@ def train_estimator(
task=task,
n_jobs=n_jobs,
)
if not fit_kwargs:
fit_kwargs = {}
if isinstance(estimator, TransformersEstimator):
fit_kwargs["metric"] = eval_metric
@@ -717,7 +736,9 @@ def get_classification_objective(num_labels: int) -> str:
return objective_name
def norm_confusion_matrix(y_true, y_pred):
def norm_confusion_matrix(
y_true: Union[np.array, pd.Series], y_pred: Union[np.array, pd.Series]
):
"""normalized confusion matrix.
Args:
@@ -735,7 +756,11 @@ def norm_confusion_matrix(y_true, y_pred):
return norm_conf_mat
def multi_class_curves(y_true, y_pred_proba, curve_func):
def multi_class_curves(
y_true: Union[np.array, pd.Series],
y_pred_proba: Union[np.array, pd.Series],
curve_func: Callable,
):
"""Binarize the data for multi-class tasks and produce ROC or precision-recall curves.
Args: