mirror of https://github.com/microsoft/autogen.git
parent
a5a5a4bc20
commit
183b867856
|
@ -172,7 +172,8 @@ Any code you commit should generally not significantly impact coverage. To run a
|
|||
```
|
||||
coverage run -m pytest test
|
||||
```
|
||||
|
||||
Then you can see the coverage report by
|
||||
`coverage report -m` or `coverage html`.
|
||||
If all the tests are passed, please also test run notebook/flaml_automl to make sure your commit does not break the notebook example.
|
||||
|
||||
## Authors
|
||||
|
|
|
@ -9,7 +9,7 @@ from functools import partial
|
|||
import numpy as np
|
||||
from scipy.sparse import issparse
|
||||
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, \
|
||||
RepeatedKFold
|
||||
RepeatedKFold, GroupKFold
|
||||
from sklearn.utils import shuffle
|
||||
import pandas as pd
|
||||
import os
|
||||
|
@ -513,6 +513,10 @@ class AutoML:
|
|||
X_train_all, y_train_all,
|
||||
self._state.fit_kwargs['sample_weight'],
|
||||
random_state=RANDOM_SEED)
|
||||
elif hasattr(self._state, 'groups') and self._state.groups is not None:
|
||||
X_train_all, y_train_all, self._state.groups = shuffle(
|
||||
X_train_all, y_train_all, self._state.groups,
|
||||
random_state=RANDOM_SEED)
|
||||
else:
|
||||
X_train_all, y_train_all = shuffle(
|
||||
X_train_all, y_train_all, random_state=RANDOM_SEED)
|
||||
|
@ -523,7 +527,10 @@ class AutoML:
|
|||
|
||||
X_train, y_train = X_train_all, y_train_all
|
||||
if X_val is None:
|
||||
# if eval_method = holdout, make holdout data
|
||||
if self._state.task != 'regression' and eval_method == 'holdout':
|
||||
# for classification, make sure the labels are complete in both
|
||||
# training and validation data
|
||||
label_set, first = np.unique(y_train_all, return_index=True)
|
||||
rest = []
|
||||
last = 0
|
||||
|
@ -565,10 +572,6 @@ class AutoML:
|
|||
X_val = concat(X_first, X_val)
|
||||
y_val = concat(label_set, y_val) if self._df else \
|
||||
np.concatenate([label_set, y_val])
|
||||
_, y_train_counts_elements = np.unique(y_train,
|
||||
return_counts=True)
|
||||
_, y_val_counts_elements = np.unique(y_val,
|
||||
return_counts=True)
|
||||
elif eval_method == 'holdout' and self._state.task == 'regression':
|
||||
if 'sample_weight' in self._state.fit_kwargs:
|
||||
X_train, X_val, y_train, y_val, self._state.fit_kwargs[
|
||||
|
@ -592,7 +595,15 @@ class AutoML:
|
|||
self.data_size_full = self._state.data_size + X_val.shape[0]
|
||||
self._state.X_train, self._state.y_train, self._state.X_val, \
|
||||
self._state.y_val = (X_train, y_train, X_val, y_val)
|
||||
if self._split_type == "stratified":
|
||||
if hasattr(self._state, 'groups') and self._state.groups is not None:
|
||||
logger.info("Using GroupKFold")
|
||||
assert len(self._state.groups) == y_train_all.size, \
|
||||
"the length of groups must match the number of examples"
|
||||
assert len(np.unique(self._state.groups)) >= n_splits, \
|
||||
"the number of groups must be equal or larger than n_splits"
|
||||
self._state.kf = GroupKFold(n_splits)
|
||||
self._state.kf.groups = self._state.groups
|
||||
elif self._split_type == "stratified":
|
||||
logger.info("Using StratifiedKFold")
|
||||
assert y_train_all.size >= n_splits, (
|
||||
f"{n_splits}-fold cross validation"
|
||||
|
@ -791,11 +802,12 @@ class AutoML:
|
|||
X_val=None,
|
||||
y_val=None,
|
||||
sample_weight_val=None,
|
||||
groups=None,
|
||||
verbose=1,
|
||||
retrain_full=True,
|
||||
split_type="stratified",
|
||||
learner_selector='sample',
|
||||
hpo_method=None,
|
||||
verbose=1,
|
||||
**fit_kwargs):
|
||||
'''Find a model for a given task
|
||||
|
||||
|
@ -853,10 +865,12 @@ class AutoML:
|
|||
log_training_metric: A boolean of whether to log the training
|
||||
metric for each model.
|
||||
mem_thres: A float of the memory size constraint in bytes
|
||||
X_val: None | a numpy array or a pandas dataframe of validation data
|
||||
y_val: None | a numpy array or a pandas series of validation labels
|
||||
sample_weight_val: None | a numpy array of the sample weight of
|
||||
X_val: None or a numpy array or a pandas dataframe of validation data
|
||||
y_val: None or a numpy array or a pandas series of validation labels
|
||||
sample_weight_val: None or a numpy array of the sample weight of
|
||||
validation data
|
||||
groups: None or an array-like of shape (n,) | Group labels for the
|
||||
samples used while splitting the dataset into train/valid set
|
||||
verbose: int, default=1 | Controls the verbosity, higher means more
|
||||
messages
|
||||
**fit_kwargs: Other key word arguments to pass to fit() function of
|
||||
|
@ -867,6 +881,7 @@ class AutoML:
|
|||
self._state.log_training_metric = log_training_metric
|
||||
self._state.fit_kwargs = fit_kwargs
|
||||
self._state.weight_val = sample_weight_val
|
||||
self._state.groups = groups
|
||||
self._validate_data(X_train, y_train, dataframe, label, X_val, y_val)
|
||||
self._search_states = {} # key: estimator name; value: SearchState
|
||||
self._random = np.random.RandomState(RANDOM_SEED)
|
||||
|
|
|
@ -9,7 +9,7 @@ import pandas as pd
|
|||
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, \
|
||||
accuracy_score, mean_absolute_error, log_loss, average_precision_score, \
|
||||
f1_score
|
||||
from sklearn.model_selection import RepeatedStratifiedKFold
|
||||
from sklearn.model_selection import RepeatedStratifiedKFold, GroupKFold
|
||||
from .model import (
|
||||
XGBoostEstimator, XGBoostSklearnEstimator, RandomForestEstimator,
|
||||
LGBMEstimator, LRL1Classifier, LRL2Classifier, CatBoostEstimator,
|
||||
|
@ -194,6 +194,8 @@ def evaluate_model_CV(
|
|||
|
||||
if isinstance(kf, RepeatedStratifiedKFold):
|
||||
kf = kf.split(X_train_split, y_train_split)
|
||||
elif isinstance(kf, GroupKFold):
|
||||
kf = kf.split(X_train_split, y_train_split, kf.groups)
|
||||
else:
|
||||
kf = kf.split(X_train_split)
|
||||
rng = np.random.RandomState(2020)
|
||||
|
|
|
@ -1 +1 @@
|
|||
__version__ = "0.5.2"
|
||||
__version__ = "0.5.3"
|
||||
|
|
|
@ -6,7 +6,7 @@ from sklearn.model_selection import train_test_split
|
|||
from sklearn.metrics import accuracy_score
|
||||
|
||||
|
||||
dataset = "Airlines"
|
||||
dataset = "credit"
|
||||
|
||||
|
||||
def _test(split_type):
|
||||
|
@ -37,5 +37,26 @@ def _test_uniform():
|
|||
_test(split_type="uniform")
|
||||
|
||||
|
||||
def test_groups():
|
||||
from sklearn.externals._arff import ArffException
|
||||
try:
|
||||
X, y = fetch_openml(name=dataset, return_X_y=True)
|
||||
except (ArffException, ValueError):
|
||||
from sklearn.datasets import load_wine
|
||||
X, y = load_wine(return_X_y=True)
|
||||
|
||||
import numpy as np
|
||||
automl = AutoML()
|
||||
automl_settings = {
|
||||
"time_budget": 2,
|
||||
"task": 'classification',
|
||||
"log_file_name": "test/{}.log".format(dataset),
|
||||
"model_history": True,
|
||||
"eval_method": "cv",
|
||||
"groups": np.random.randint(low=0, high=10, size=len(y)),
|
||||
}
|
||||
automl.fit(X, y, **automl_settings)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
|
Loading…
Reference in New Issue