Improve annotations in automl and ml modules (#919)

* begin annotation in automl.py and ml.py

* EstimatorSubclass + annotate metric

* review: fixes + setting fit_kwargs as proper Optional

* import from flaml.automl.model (import from flaml.model is deprecated)

* comment n_jobs in train_estimator as well

* better annotation in _compute_with_config_base

Co-authored-by: Qingyun Wu <qingyun.wu@psu.edu>

---------

Co-authored-by: Andrea W <a.ruggerini@ammagamma.com>
Co-authored-by: Qingyun Wu <qingyun.wu@psu.edu>
Andrea Ruggerini 2023-02-22 03:49:56 +01:00 committed by GitHub
parent 6aa1d16ebc
commit 8e447562c7
2 changed files with 67 additions and 39 deletions

automl.py

@@ -2,6 +2,7 @@
# * Copyright (c) FLAML authors. All rights reserved.
# * Licensed under the MIT License. See LICENSE file in the
# * project root for license information.
from __future__ import annotations
import time
import os
import sys
@@ -306,7 +307,7 @@ class SearchState:
class AutoMLState:
def _prepare_sample_train_data(self, sample_size):
def _prepare_sample_train_data(self, sample_size: int):
sampled_weight = groups = None
if sample_size <= self.data_size[0]:
if isinstance(self.X_train, pd.DataFrame):
@@ -344,7 +345,9 @@ class AutoMLState:
return sampled_X_train, sampled_y_train, sampled_weight, groups
@staticmethod
def _compute_with_config_base(config_w_resource, state, estimator, is_report=True):
def _compute_with_config_base(
config_w_resource: dict, state: AutoMLState, estimator: str, is_report: bool = True
) -> dict:
if "FLAML_sample_size" in config_w_resource:
sample_size = int(config_w_resource["FLAML_sample_size"])
else:
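
Why the first hunk adds `from __future__ import annotations`: the new `_compute_with_config_base` signature annotates `state: AutoMLState` inside the `AutoMLState` class itself. Under PEP 563 the import turns annotations into lazily-evaluated strings, so the forward self-reference is legal; without it, the name would be looked up while the class body is still executing. A minimal sketch of the pattern (the method body is a stand-in, not FLAML's logic):

```python
from __future__ import annotations  # PEP 563: annotations are stored as strings


class AutoMLState:
    @staticmethod
    def _compute_with_config_base(state: AutoMLState) -> dict:
        # Without the __future__ import, "AutoMLState" would be evaluated
        # at definition time and raise NameError: the class name is not
        # bound until the class body finishes executing.
        return {"state": state}
```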
@@ -435,9 +438,9 @@ class AutoMLState:
def _train_with_config(
self,
estimator,
config_w_resource,
sample_size=None,
estimator: str,
config_w_resource: dict,
sample_size: Optional[int] = None,
):
if not sample_size:
sample_size = config_w_resource.get(
@@ -801,11 +804,11 @@ class AutoML(BaseEstimator):
"classifier" if settings["task"] in CLASSIFICATION else "regressor"
)
def get_params(self, deep=False):
def get_params(self, deep: bool = False) -> dict:
return self._settings.copy()
@property
def config_history(self):
def config_history(self) -> dict:
"""A dictionary of iter->(estimator, config, time),
storing the best estimator, config, and the time when the best
model is updated each time.
@@ -819,7 +822,7 @@ class AutoML(BaseEstimator):
"""
return self.__dict__.get("_trained_estimator")
def best_model_for_estimator(self, estimator_name):
def best_model_for_estimator(self, estimator_name: str):
"""Return the best model found for a particular estimator.
Args:
@@ -1587,7 +1590,7 @@ class AutoML(BaseEstimator):
"""
self._state.learner_classes[learner_name] = learner_class
def get_estimator_from_log(self, log_file_name, record_id, task):
def get_estimator_from_log(self, log_file_name: str, record_id: int, task: str):
"""Get the estimator from log file.
Args:

ml.py

@@ -5,6 +5,8 @@
import time
import numpy as np
import pandas as pd
from typing import Union, Callable, TypeVar, Optional, Tuple
from sklearn.metrics import (
mean_squared_error,
r2_score,
@@ -46,9 +48,11 @@ from flaml.automl.model import (
TransformersEstimatorModelSelection,
)
from flaml.automl.data import CLASSIFICATION, group_counts, TS_FORECAST
from flaml.automl.model import BaseEstimator
import logging
logger = logging.getLogger(__name__)
EstimatorSubclass = TypeVar("EstimatorSubclass", bound=BaseEstimator)
sklearn_metric_name_set = {
"r2",
@@ -101,7 +105,12 @@ huggingface_metric_to_mode = {
huggingface_submetric_to_metric = {"rouge1": "rouge", "rouge2": "rouge"}
def get_estimator_class(task, estimator_name):
def get_estimator_class(task: str, estimator_name: str) -> EstimatorSubclass:
"""Given a task and an estimator name, return the relevant flaml-wrapped estimator class
NOTE: See why the return type is declared by using TypeVar here on the mypy doc
https://mypy.readthedocs.io/en/stable/kinds_of_types.html#the-type-of-class-objects
"""
# when adding a new learner, need to add an elif branch
if "xgboost" == estimator_name:
estimator_class = XGBoost_TS if task in TS_FORECAST else XGBoostSklearnEstimator
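
The bound `TypeVar` added in `ml.py` (see the hunk above and the mypy link in the docstring) lets annotations preserve the concrete subclass instead of widening everything to `BaseEstimator`. A minimal, self-contained sketch of the idea, with stand-in class names rather than FLAML's real estimators:

```python
from typing import Type, TypeVar


class BaseEstimator:
    """Stand-in for flaml.automl.model.BaseEstimator."""


class LGBMLikeEstimator(BaseEstimator):
    """Hypothetical subclass used only for illustration."""


# Any use of this TypeVar must be BaseEstimator or one of its subclasses.
EstimatorSubclass = TypeVar("EstimatorSubclass", bound=BaseEstimator)


def make_estimator(cls: Type[EstimatorSubclass]) -> EstimatorSubclass:
    # Because the same TypeVar appears in both positions, mypy infers the
    # return type from the argument: passing LGBMLikeEstimator yields an
    # LGBMLikeEstimator, not a bare BaseEstimator.
    return cls()


est = make_estimator(LGBMLikeEstimator)  # inferred as LGBMLikeEstimator
```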
@@ -144,7 +153,7 @@ def get_estimator_class(task, estimator_name):
def metric_loss_score(
metric_name,
metric_name: str,
y_processed_predict,
y_processed_true,
labels=None,
@@ -223,11 +232,11 @@ def metric_loss_score(
return score
def is_in_sklearn_metric_name_set(metric_name):
def is_in_sklearn_metric_name_set(metric_name: str):
return metric_name.startswith("ndcg") or metric_name in sklearn_metric_name_set
def is_min_metric(metric_name):
def is_min_metric(metric_name: str):
return (
metric_name in ["rmse", "mae", "mse", "log_loss", "mape"]
or huggingface_metric_to_mode.get(metric_name, None) == "min"
@@ -235,7 +244,7 @@ def is_min_metric(metric_name):
def sklearn_metric_loss_score(
metric_name,
metric_name: str,
y_predict,
y_true,
labels=None,
@@ -372,7 +381,7 @@ def _eval_estimator(
y_val,
weight_val,
groups_val,
eval_metric,
eval_metric: Union[str, Callable],
obj,
labels=None,
log_training_metric=False,
@@ -424,14 +433,14 @@ def _eval_estimator(
def get_val_loss(
config,
estimator,
estimator: EstimatorSubclass,
X_train,
y_train,
X_val,
y_val,
weight_val,
groups_val,
eval_metric,
eval_metric: Union[str, Callable],
obj,
labels=None,
budget=None,
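
`eval_metric: Union[str, Callable]` in the hunk above reflects FLAML's dual interface: a built-in metric can be named by string, or a custom metric can be passed as a callable. A minimal sketch of how such a union is typically dispatched (the registry and function names here are hypothetical, not FLAML's internals):

```python
from typing import Callable, Dict, List, Union

# Hypothetical registry mapping metric names to loss functions.
_BUILTIN: Dict[str, Callable[[List[float], List[float]], float]] = {
    "mse": lambda t, p: sum((a - b) ** 2 for a, b in zip(t, p)) / len(t),
}


def loss(y_true: List[float], y_pred: List[float],
         eval_metric: Union[str, Callable]) -> float:
    if isinstance(eval_metric, str):
        return _BUILTIN[eval_metric](y_true, y_pred)  # look up by name
    return eval_metric(y_true, y_pred)  # call the user-supplied metric


print(loss([1.0, 2.0], [1.0, 3.0], "mse"))  # 0.5
print(loss([1.0, 2.0], [1.0, 3.0],
           lambda t, p: max(abs(a - b) for a, b in zip(t, p))))  # 1.0
```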
@@ -487,13 +496,13 @@ def default_cv_score_agg_func(val_loss_folds, log_metrics_folds):
def evaluate_model_CV(
config,
estimator,
config: dict,
estimator: EstimatorSubclass,
X_train_all,
y_train_all,
budget,
kf,
task,
task: str,
eval_metric,
best_val_loss,
cv_score_agg_func=None,
@@ -607,19 +616,24 @@ def compute_estimator(
groups_val,
budget,
kf,
config_dic,
task,
estimator_name,
eval_method,
eval_metric,
config_dic: dict,
task: str,
estimator_name: str,
eval_method: str,
eval_metric: Union[str, Callable],
best_val_loss=np.Inf,
n_jobs=1,
estimator_class=None,
cv_score_agg_func=None,
log_training_metric=False,
fit_kwargs={},
n_jobs: Optional[
int
] = 1, # some estimators of EstimatorSubclass don't accept n_jobs. Should be None in that case.
estimator_class: Optional[EstimatorSubclass] = None,
cv_score_agg_func: Optional[callable] = None,
log_training_metric: Optional[bool] = False,
fit_kwargs: Optional[dict] = None,
free_mem_ratio=0,
):
if not fit_kwargs:
fit_kwargs = {}
estimator_class = estimator_class or get_estimator_class(task, estimator_name)
estimator = estimator_class(
**config_dic,
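
The `fit_kwargs={}` → `fit_kwargs: Optional[dict] = None` change (paired with the `if not fit_kwargs: fit_kwargs = {}` guard above) sidesteps Python's mutable-default pitfall: a default value is evaluated once at `def` time, so a `{}` default is one shared dict mutated by every call. A minimal sketch with hypothetical function names:

```python
def fit_shared(fit_kwargs={}):
    # The default dict is created once and reused for every call.
    fit_kwargs["seen"] = True
    return fit_kwargs


def fit_fresh(fit_kwargs=None):
    if not fit_kwargs:
        fit_kwargs = {}  # a new dict on each call, as in the diff above
    fit_kwargs["seen"] = True
    return fit_kwargs


assert fit_shared() is fit_shared()    # the same object leaks across calls
assert fit_fresh() is not fit_fresh()  # independent dicts per call
```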
@@ -677,18 +691,20 @@ def compute_estimator(
def train_estimator(
config_dic,
config_dic: dict,
X_train,
y_train,
task,
estimator_name,
n_jobs=1,
estimator_class=None,
task: str,
estimator_name: str,
n_jobs: Optional[
int
] = 1, # some estimators of EstimatorSubclass don't accept n_jobs. Should be None in that case.
estimator_class: Optional[EstimatorSubclass] = None,
budget=None,
fit_kwargs={},
fit_kwargs: Optional[dict] = None,
eval_metric=None,
free_mem_ratio=0,
):
) -> Tuple[EstimatorSubclass, float]:
start_time = time.time()
estimator_class = estimator_class or get_estimator_class(task, estimator_name)
estimator = estimator_class(
@@ -696,6 +712,9 @@ def train_estimator(
task=task,
n_jobs=n_jobs,
)
if not fit_kwargs:
fit_kwargs = {}
if isinstance(estimator, TransformersEstimator):
fit_kwargs["metric"] = eval_metric
@@ -717,7 +736,9 @@ def get_classification_objective(num_labels: int) -> str:
return objective_name
def norm_confusion_matrix(y_true, y_pred):
def norm_confusion_matrix(
y_true: Union[np.array, pd.Series], y_pred: Union[np.array, pd.Series]
):
"""normalized confusion matrix.
Args:
@@ -735,7 +756,11 @@ def norm_confusion_matrix(y_true, y_pred):
return norm_conf_mat
def multi_class_curves(y_true, y_pred_proba, curve_func):
def multi_class_curves(
y_true: Union[np.array, pd.Series],
y_pred_proba: Union[np.array, pd.Series],
curve_func: Callable,
):
"""Binarize the data for multi-class tasks and produce ROC or precision-recall curves.
Args: