mirror of https://github.com/microsoft/autogen.git
Improve annotations in automl and ml modules (#919)
* begin annotation in automl.py and ml.py
* EstimatorSubclass + annotate metric
* review: fixes + setting fit_kwargs as proper Optional
* import from flaml.automl.model (import from flaml.model is deprecated)
* comment n_jobs in train_estimator as well
* better annotation in _compute_with_config_base

Co-authored-by: Qingyun Wu <qingyun.wu@psu.edu>

---------

Co-authored-by: Andrea W <a.ruggerini@ammagamma.com>
Co-authored-by: Qingyun Wu <qingyun.wu@psu.edu>
This commit is contained in:
parent 6aa1d16ebc
commit 8e447562c7
flaml/automl/automl.py
@@ -2,6 +2,7 @@
 # * Copyright (c) FLAML authors. All rights reserved.
 # * Licensed under the MIT License. See LICENSE file in the
 # * project root for license information.
+from __future__ import annotations
 import time
 import os
 import sys
@@ -306,7 +307,7 @@ class SearchState:
 
 
 class AutoMLState:
-    def _prepare_sample_train_data(self, sample_size):
+    def _prepare_sample_train_data(self, sample_size: int):
         sampled_weight = groups = None
         if sample_size <= self.data_size[0]:
             if isinstance(self.X_train, pd.DataFrame):
@@ -344,7 +345,9 @@ class AutoMLState:
         return sampled_X_train, sampled_y_train, sampled_weight, groups
 
     @staticmethod
-    def _compute_with_config_base(config_w_resource, state, estimator, is_report=True):
+    def _compute_with_config_base(
+        config_w_resource: dict, state: AutoMLState, estimator: str, is_report: bool = True
+    ) -> dict:
         if "FLAML_sample_size" in config_w_resource:
             sample_size = int(config_w_resource["FLAML_sample_size"])
         else:
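A note on the hunk above: the new signature annotates `state: AutoMLState` inside the body of `AutoMLState` itself, which only works because the first hunk adds `from __future__ import annotations` — with postponed evaluation, annotations stay strings and may name a class that is not yet fully defined. A minimal sketch of the same pattern, with an illustrative class name (not from this diff):

from __future__ import annotations


class Node:
    # Without the future import, the `Node` annotations on the def below
    # would raise NameError: the class object does not exist yet while
    # its body executes.
    @staticmethod
    def merge(left: Node, right: Node) -> Node:
        return left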
@@ -435,9 +438,9 @@ class AutoMLState:
 
     def _train_with_config(
         self,
-        estimator,
-        config_w_resource,
-        sample_size=None,
+        estimator: str,
+        config_w_resource: dict,
+        sample_size: Optional[int] = None,
     ):
         if not sample_size:
            sample_size = config_w_resource.get(
@@ -801,11 +804,11 @@ class AutoML(BaseEstimator):
             "classifier" if settings["task"] in CLASSIFICATION else "regressor"
         )
 
-    def get_params(self, deep=False):
+    def get_params(self, deep: bool = False) -> dict:
         return self._settings.copy()
 
     @property
-    def config_history(self):
+    def config_history(self) -> dict:
         """A dictionary of iter->(estimator, config, time),
         storing the best estimator, config, and the time when the best
         model is updated each time.
@@ -819,7 +822,7 @@ class AutoML(BaseEstimator):
         """
         return self.__dict__.get("_trained_estimator")
 
-    def best_model_for_estimator(self, estimator_name):
+    def best_model_for_estimator(self, estimator_name: str):
         """Return the best model found for a particular estimator.
 
         Args:
@@ -1587,7 +1590,7 @@ class AutoML(BaseEstimator):
         """
         self._state.learner_classes[learner_name] = learner_class
 
-    def get_estimator_from_log(self, log_file_name, record_id, task):
+    def get_estimator_from_log(self, log_file_name: str, record_id: int, task: str):
         """Get the estimator from log file.
 
         Args:
flaml/automl/ml.py
@@ -5,6 +5,8 @@
 import time
 import numpy as np
 import pandas as pd
+from typing import Union, Callable, TypeVar, Optional, Tuple
+
 from sklearn.metrics import (
     mean_squared_error,
     r2_score,
@@ -46,9 +48,11 @@ from flaml.automl.model import (
     TransformersEstimatorModelSelection,
 )
 from flaml.automl.data import CLASSIFICATION, group_counts, TS_FORECAST
+from flaml.automl.model import BaseEstimator
 import logging
 
 logger = logging.getLogger(__name__)
+EstimatorSubclass = TypeVar("EstimatorSubclass", bound=BaseEstimator)
 
 sklearn_metric_name_set = {
     "r2",
@@ -101,7 +105,12 @@ huggingface_metric_to_mode = {
 huggingface_submetric_to_metric = {"rouge1": "rouge", "rouge2": "rouge"}
 
 
-def get_estimator_class(task, estimator_name):
+def get_estimator_class(task: str, estimator_name: str) -> EstimatorSubclass:
+    """Given a task and an estimator name, return the relevant flaml-wrapped estimator class
+
+    NOTE: See why the return type is declared by using TypeVar here on the mypy doc
+    https://mypy.readthedocs.io/en/stable/kinds_of_types.html#the-type-of-class-objects
+    """
     # when adding a new learner, need to add an elif branch
     if "xgboost" == estimator_name:
         estimator_class = XGBoost_TS if task in TS_FORECAST else XGBoostSklearnEstimator
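The mypy page cited in the docstring covers typing functions that return class objects rather than instances. A minimal sketch of the bound-TypeVar idea under invented names (none of these classes exist in FLAML):

from typing import Type, TypeVar


class Base:
    def __init__(self, **params):
        self.params = params


class XGBoostLike(Base):
    pass


E = TypeVar("E", bound=Base)


def build(estimator_class: Type[E], **params) -> E:
    # Because E is bound to Base, a checker knows build(XGBoostLike)
    # yields an XGBoostLike instance, not merely a Base.
    return estimator_class(**params)


model = build(XGBoostLike, n_estimators=4)  # inferred type: XGBoostLike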
@@ -144,7 +153,7 @@ def get_estimator_class(task, estimator_name):
 
 
 def metric_loss_score(
-    metric_name,
+    metric_name: str,
     y_processed_predict,
     y_processed_true,
     labels=None,
@@ -223,11 +232,11 @@ def metric_loss_score(
     return score
 
 
-def is_in_sklearn_metric_name_set(metric_name):
+def is_in_sklearn_metric_name_set(metric_name: str):
     return metric_name.startswith("ndcg") or metric_name in sklearn_metric_name_set
 
 
-def is_min_metric(metric_name):
+def is_min_metric(metric_name: str):
     return (
         metric_name in ["rmse", "mae", "mse", "log_loss", "mape"]
         or huggingface_metric_to_mode.get(metric_name, None) == "min"
@@ -235,7 +244,7 @@ def is_min_metric(metric_name):
 
 
 def sklearn_metric_loss_score(
-    metric_name,
+    metric_name: str,
     y_predict,
     y_true,
     labels=None,
@@ -372,7 +381,7 @@ def _eval_estimator(
     y_val,
     weight_val,
     groups_val,
-    eval_metric,
+    eval_metric: Union[str, Callable],
     obj,
     labels=None,
     log_training_metric=False,
@@ -424,14 +433,14 @@ def _eval_estimator(
 
 def get_val_loss(
     config,
-    estimator,
+    estimator: EstimatorSubclass,
     X_train,
     y_train,
     X_val,
     y_val,
     weight_val,
     groups_val,
-    eval_metric,
+    eval_metric: Union[str, Callable],
     obj,
     labels=None,
    budget=None,
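`eval_metric: Union[str, Callable]` says a metric can arrive either as a built-in metric name or as a user-supplied function. A hedged sketch of the dispatch such a union implies — the registry and function names below are invented for illustration, not FLAML's API:

from typing import Callable, Union

import numpy as np

# Hypothetical stand-in for a table of named metrics.
_BUILTIN = {"mse": lambda y_pred, y_true: float(np.mean((y_pred - y_true) ** 2))}


def resolve_metric(eval_metric: Union[str, Callable]) -> Callable:
    if isinstance(eval_metric, str):
        return _BUILTIN[eval_metric]  # named metric: look it up
    return eval_metric                # already callable: use as-is


loss = resolve_metric("mse")(np.array([1.0, 2.0]), np.array([1.0, 0.0]))  # 2.0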
@@ -487,13 +496,13 @@ def default_cv_score_agg_func(val_loss_folds, log_metrics_folds):
 
 
 def evaluate_model_CV(
-    config,
-    estimator,
+    config: dict,
+    estimator: EstimatorSubclass,
     X_train_all,
     y_train_all,
     budget,
     kf,
-    task,
+    task: str,
     eval_metric,
     best_val_loss,
     cv_score_agg_func=None,
@@ -607,19 +616,24 @@ def compute_estimator(
     groups_val,
     budget,
     kf,
-    config_dic,
-    task,
-    estimator_name,
-    eval_method,
-    eval_metric,
+    config_dic: dict,
+    task: str,
+    estimator_name: str,
+    eval_method: str,
+    eval_metric: Union[str, Callable],
     best_val_loss=np.Inf,
-    n_jobs=1,
-    estimator_class=None,
-    cv_score_agg_func=None,
-    log_training_metric=False,
-    fit_kwargs={},
+    n_jobs: Optional[
+        int
+    ] = 1,  # some estimators of EstimatorSubclass don't accept n_jobs. Should be None in that case.
+    estimator_class: Optional[EstimatorSubclass] = None,
+    cv_score_agg_func: Optional[callable] = None,
+    log_training_metric: Optional[bool] = False,
+    fit_kwargs: Optional[dict] = None,
     free_mem_ratio=0,
 ):
+    if not fit_kwargs:
+        fit_kwargs = {}
+
     estimator_class = estimator_class or get_estimator_class(task, estimator_name)
     estimator = estimator_class(
         **config_dic,
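Beyond adding types, swapping `fit_kwargs={}` for `fit_kwargs: Optional[dict] = None` plus the `if not fit_kwargs` guard removes the classic mutable-default pitfall: a `{}` default is created once at function definition and shared by every call. A standalone illustration (not FLAML code):

def bad(x, acc={}):          # one dict object shared across all calls
    acc[x] = True
    return acc


def good(x, acc=None):       # fresh dict per call unless one is passed in
    if not acc:
        acc = {}
    acc[x] = True
    return acc


bad(1)
print(bad(2))    # {1: True, 2: True} -- state leaked from the first call
good(1)
print(good(2))   # {2: True}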
@@ -677,18 +691,20 @@ def compute_estimator(
 
 
 def train_estimator(
-    config_dic,
+    config_dic: dict,
     X_train,
     y_train,
-    task,
-    estimator_name,
-    n_jobs=1,
-    estimator_class=None,
+    task: str,
+    estimator_name: str,
+    n_jobs: Optional[
+        int
+    ] = 1,  # some estimators of EstimatorSubclass don't accept n_jobs. Should be None in that case.
+    estimator_class: Optional[EstimatorSubclass] = None,
     budget=None,
-    fit_kwargs={},
+    fit_kwargs: Optional[dict] = None,
     eval_metric=None,
     free_mem_ratio=0,
-):
+) -> Tuple[EstimatorSubclass, float]:
     start_time = time.time()
     estimator_class = estimator_class or get_estimator_class(task, estimator_name)
     estimator = estimator_class(
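The new `-> Tuple[EstimatorSubclass, float]` annotation documents the contract that `train_estimator` hands back the fitted estimator together with the elapsed training time, and the inline `n_jobs` comment warns that learners which do not accept `n_jobs` should receive `None`. A runnable sketch of that two-value contract with stub names (nothing below is FLAML code):

import time
from typing import Optional, Tuple


class StubEstimator:
    """Stand-in for a learner whose constructor takes no n_jobs."""

    def fit(self, X, y):
        return self


def train_stub(X, y, n_jobs: Optional[int] = 1) -> Tuple[StubEstimator, float]:
    start = time.time()
    estimator = StubEstimator().fit(X, y)   # n_jobs deliberately not forwarded
    return estimator, time.time() - start


model, train_time = train_stub([[0], [1]], [0, 1], n_jobs=None)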
@@ -696,6 +712,9 @@ def train_estimator(
         task=task,
         n_jobs=n_jobs,
     )
+    if not fit_kwargs:
+        fit_kwargs = {}
+
     if isinstance(estimator, TransformersEstimator):
         fit_kwargs["metric"] = eval_metric
 
@@ -717,7 +736,9 @@ def get_classification_objective(num_labels: int) -> str:
     return objective_name
 
 
-def norm_confusion_matrix(y_true, y_pred):
+def norm_confusion_matrix(
+    y_true: Union[np.array, pd.Series], y_pred: Union[np.array, pd.Series]
+):
     """normalized confusion matrix.
 
     Args:
@@ -735,7 +756,11 @@ def norm_confusion_matrix(y_true, y_pred):
     return norm_conf_mat
 
 
-def multi_class_curves(y_true, y_pred_proba, curve_func):
+def multi_class_curves(
+    y_true: Union[np.array, pd.Series],
+    y_pred_proba: Union[np.array, pd.Series],
+    curve_func: Callable,
+):
     """Binarize the data for multi-class tasks and produce ROC or precision-recall curves.
 
     Args:
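One hedged caveat on these last two hunks: `np.array` is NumPy's array-constructing function rather than a type, so static checkers treat `Union[np.array, pd.Series]` loosely; the instance type is `np.ndarray`. A stricter spelling of the same intent, offered as an alternative sketch rather than what the commit ships:

from typing import Union

import numpy as np
import pandas as pd


def normalize(y: Union[np.ndarray, pd.Series]) -> np.ndarray:
    # Accept either container and return a float ndarray summing to 1.
    values = np.asarray(y, dtype=float)
    return values / values.sum()


print(normalize(pd.Series([1, 3])))  # [0.25 0.75]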