* test distillbert

* import check

* complete partial config

* None check

* init config is not suggested by bo

* badge

* notebook for lightgbm
Chi Wang 2021-02-22 22:10:41 -08:00 committed by GitHub
parent 2d3bd84038
commit 6ff0ed434b
17 changed files with 2048 additions and 1447 deletions


@@ -1,7 +1,7 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
name: Python package
name: Build
on:
push:

.gitignore

@@ -146,6 +146,8 @@ dmypy.json
# Cython debug symbols
cython_debug/
/catboost_info
catboost_info
notebook/*.pkl
notebook/.azureml
mlruns


@@ -1,3 +1,8 @@
[![PyPI version](https://badge.fury.io/py/FLAML.svg)](https://badge.fury.io/py/FLAML)
[![Build](https://github.com/microsoft/FLAML/actions/workflows/python-package.yml/badge.svg)](https://github.com/microsoft/FLAML/actions/workflows/python-package.yml)
![Python Version](https://img.shields.io/badge/3.6%20%7C%203.7%20%7C%203.8-blue)
[![Downloads](https://pepy.tech/badge/flaml/month)](https://pepy.tech/project/flaml)
# FLAML - Fast and Lightweight AutoML
<p align="center">


@@ -1,5 +1,5 @@
from flaml.searcher import CFO, BlendSearch, FLOW2
from flaml.automl import AutoML
from flaml.automl import AutoML, logger_formatter
from flaml.version import __version__
import logging
@@ -7,10 +7,3 @@ import logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Add the console handler.
_ch = logging.StreamHandler()
logger_formatter = logging.Formatter(
'[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s',
'%m-%d %H:%M:%S')
_ch.setFormatter(logger_formatter)
logger.addHandler(_ch)


@@ -25,6 +25,10 @@ from .training_log import training_log_reader, training_log_writer
import logging
logger = logging.getLogger(__name__)
logger_formatter = logging.Formatter(
'[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s',
'%m-%d %H:%M:%S')
try:
import mlflow
except:
@@ -326,6 +330,10 @@ class AutoML:
A numpy array of shape n * 1 - - each element is a predicted class
label for an instance.
'''
if self._trained_estimator is None:
warnings.warn(
"No estimator is trained. Please run fit with enough budget.")
return None
X_test = self._preprocess(X_test)
y_pred = self._trained_estimator.predict(X_test)
if y_pred.ndim > 1: y_pred = y_pred.flatten()
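
The guard above makes `predict` fail soft instead of raising on a missing estimator. A minimal sketch of the resulting behavior, using an illustrative dataset and a deliberately tiny budget (not part of the commit):

```python
from sklearn.datasets import load_iris
from flaml import AutoML

X, y = load_iris(return_X_y=True)
automl = AutoML()
# With an extremely small budget, no estimator may get trained at all.
automl.fit(X_train=X, y_train=y, task='classification', time_budget=0)
# predict now warns "No estimator is trained..." and returns None
# instead of failing on a missing trained estimator.
print(automl.predict(X))
```
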
@@ -837,6 +845,11 @@ class AutoML:
if eval_method == 'auto' or self._state.X_val is not None:
eval_method = self._decide_eval_method(time_budget)
self._state.eval_method = eval_method
if not mlflow or not mlflow.active_run() and not logger.handlers:
# Add the console handler.
_ch = logging.StreamHandler()
_ch.setFormatter(logger_formatter)
logger.addHandler(_ch)
logger.info("Evaluation method: {}".format(eval_method))
self._retrain_full = retrain_full and (eval_method == 'holdout' and
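
With the log format promoted to a module-level `logger_formatter` (re-exported from `flaml`), user code can reuse the same format for its own handlers, and `fit` now only attaches the console handler lazily when no handler or active MLflow run is found. A hedged sketch, with an illustrative log file path:

```python
import logging
from flaml import AutoML, logger_formatter

# Attach a file handler that reuses FLAML's format string; since the
# 'flaml.automl' logger now has a handler before fit() runs, fit() can
# skip adding its own console handler.
logger = logging.getLogger('flaml.automl')
fh = logging.FileHandler('automl.log')   # illustrative path
fh.setFormatter(logger_formatter)
logger.addHandler(fh)

automl = AutoML()
# automl.fit(...) would then write "Evaluation method: ..." etc. to automl.log
```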


@@ -113,8 +113,9 @@ class BlendSearch(Searcher):
self._deadline = config.get('time_budget_s') + time.time()
if 'metric_target' in config:
self._metric_target = config.get('metric_target')
else:
self._metric, self._mode = metric, mode
else:
if metric: self._metric = metric
if mode: self._mode = mode
self._ls.set_search_properties(metric, mode, config)
if self._gs is not None:
self._gs.set_search_properties(metric, mode, config)
@@ -300,11 +301,9 @@ class BlendSearch(Searcher):
else: # use init config
init_config = self._points_to_evaluate.pop(
0) if self._points_to_evaluate else self._ls.init_config
if init_config==self._ls.init_config:
config = self._ls.complete_config(init_config,
config = self._ls.complete_config(init_config,
self._admissible_min, self._admissible_max)
# logger.info(f"reset config to {config}")
else: config = init_config
config_signature = self._ls.config_signature(config)
result = self._result.get(config_signature)
if result: # tried before
@@ -314,7 +313,6 @@ class BlendSearch(Searcher):
self._result[config_signature] = {}
else: return None # running but no result yet
self._init_used = True
self._trial_proposed_by[trial_id] = 0
# logger.info(f"config={config}")
return config
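
After this change, a start point popped from `points_to_evaluate` is always passed through the local search's `complete_config`, so partial configs get their remaining dimensions filled in before being proposed. A hedged usage sketch with a toy objective (names and values are illustrative):

```python
from ray import tune
from flaml import BlendSearch

def toy_objective(config):
    # stand-in for a real training run
    tune.report(val_loss=config['learning_rate'] * config['num_train_epochs'])

# Partial start point: only num_train_epochs is pinned; the other dimension
# is completed by the local search before the first trial is proposed.
algo = BlendSearch(points_to_evaluate=[{'num_train_epochs': 1}])

analysis = tune.run(
    toy_objective,
    config={
        'num_train_epochs': tune.loguniform(1, 10),
        'learning_rate': tune.loguniform(1e-6, 1e-4),
    },
    metric='val_loss', mode='min',
    search_alg=algo, num_samples=4,
)
```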


@@ -190,6 +190,8 @@ class FLOW2(Searcher):
self._K = 0
self._iter_best_config = self.trial_count = 1
self._reset_times = 0
# record intermediate trial cost
self._trial_cost = {}
@property
def step_lower_bound(self) -> float:
@@ -237,7 +239,8 @@
''' generate a complete config from the partial config input
add minimal resource to config if available
'''
if self._reset_times: # not the first time, use random gaussian
if self._reset_times and partial_config==self.init_config:
# not the first time to complete init_config, use random gaussian
normalized = self.normalize(partial_config)
for key in normalized:
# don't change unordered cat choice
@@ -258,21 +261,22 @@
normalized[key] = max(l, min(u, normalized[key] + delta))
# use best config for unordered cat choice
config = self.denormalize(normalized)
self._reset_times += 1
else:
# first time init_config, or other configs, take as is
config = partial_config.copy()
for key, value in self.space.items():
if key not in config:
config[key] = value
logger.debug(f'before random {config}')
# logger.debug(f'before random {config}')
for _, generated in generate_variants({'config': config}):
config = generated['config']
break
logger.debug(f'after random {config}')
# logger.debug(f'after random {config}')
if self._resource:
config[self.prune_attr] = self.min_resource
self._reset_times += 1
return config
def create(self, init_config: Dict, obj: float, cost: float) -> Searcher:
@@ -442,7 +446,8 @@
if proposed_by == self.incumbent:
# proposed by current incumbent and no better
self._num_complete4incumbent += 1
cost = result.get(self.cost_attr)
cost = result.get(
self.cost_attr) if result else self._trial_cost.get(trial_id)
if cost: self._cost_complete4incumbent += cost
if self._num_complete4incumbent >= 2*self.dim and \
self._num_allowed4incumbent == 0:
@@ -483,6 +488,9 @@
self._num_allowed4incumbent = 2 * self.dim
self._proposed_by.clear()
self._iter_best_config = self.trial_count
cost = result.get(self.cost_attr)
# record the cost in case it is pruned and cost info is lost
self._trial_cost[trial_id] = cost
def rand_vector_unit_sphere(self, dim) -> np.ndarray:
vec = self._random.normal(0, 1, dim)
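
Two things change in FLOW2: completed trial costs are cached in `_trial_cost` so a pruned trial's cost is not lost, and `complete_config` only applies its random Gaussian perturbation when the same `init_config` is completed again. A rough numpy sketch of that perturbation idea (a conceptual illustration, not the library code; values are made up):

```python
import numpy as np

rng = np.random.RandomState(0)
# init_config mapped into the normalized [0, 1] search space
normalized = {'learning_rate': 0.3, 'num_train_epochs': 0.1}
lower, upper, step = 0.0, 1.0, 0.1

# On a repeated completion of init_config, nudge each ordered dimension by a
# Gaussian step and clip to the admissible range, as in the hunk above.
for key in normalized:
    delta = rng.normal(0, 1) * step
    normalized[key] = max(lower, min(upper, normalized[key] + delta))
# FLOW2 then denormalizes the result back into the original search space.
```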


@@ -1 +1 @@
__version__ = "0.2.4"
__version__ = "0.2.5"

notebook/flaml_automl.ipynb

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook uses the Huggingface transformers library to finetune a transformer model.\n",
"This notebook uses flaml to finetune a transformer model from Huggingface transformers library.\n",
"\n",
"**Requirements.** This notebook has additional requirements:"
]
@@ -673,12 +673,14 @@
"metadata": {},
"outputs": [],
"source": [
"max_num_epoch = 4\n",
"max_num_epoch = 64\n",
"search_space = {\n",
" # You can mix constants with search space objects.\n",
" \"num_train_epochs\": flaml.tune.loguniform(1, max_num_epoch),\n",
" \"learning_rate\": flaml.tune.loguniform(1e-6, 1e-4),\n",
" \"adam_epsilon\": flaml.tune.loguniform(1e-9, 1e-7),\n",
" \"adam_beta1\": flaml.tune.uniform(0.8, 0.99),\n",
" \"adam_beta2\": flaml.tune.loguniform(98e-2, 9999e-4),\n",
" }"
]
},
@@ -692,12 +694,12 @@
"HP_METRIC, MODE = \"matthews_correlation\", \"max\"\n",
"\n",
"# resources\n",
"num_cpus = 2\n",
"num_gpus = 2\n",
"num_cpus = 4\n",
"num_gpus = 4\n",
"\n",
"# constraints\n",
"num_samples = -1 # number of trials, -1 means unlimited\n",
"time_budget_s = 3600 # time budget in seconds"
"time_budget_s = 10800 # time budget in seconds"
]
},
{
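
The revised cells raise the epoch cap to 64, add the Adam betas to the search space, and double both the resources and the time budget. A hedged sketch of what the new resource numbers imply for the run (the one-GPU-per-trial allocation is an assumption, mirroring the test file below):

```python
import ray

num_cpus, num_gpus = 4, 4      # values from the cell above
gpus_per_trial = 1             # assumed allocation per trial

# With one GPU per trial, up to 4 trials run concurrently, so the 10800 s
# wall-clock budget covers roughly 4x that amount of training time.
max_concurrent_trials = num_gpus // gpus_per_trial
ray.init(num_cpus=num_cpus, num_gpus=num_gpus)
```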

File diff suppressed because one or more lines are too long


@@ -274,7 +274,7 @@ class TestAutoML(unittest.TestCase):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 2,
"time_budget": 3,
"metric": 'ap',
"task": 'classification',
"log_file_name": "test/sparse_classification.log",

test/test_distillbert.py

@@ -0,0 +1,217 @@
'''Require: pip install torch transformers datasets flaml[blendsearch,ray]
'''
import time
import numpy as np
try:
import ray
from datasets import (
load_dataset,
load_metric,
)
from transformers import (
AutoModelForSequenceClassification,
AutoTokenizer,
Trainer,
TrainingArguments,
)
except:
print("pip install torch transformers datasets flaml[blendsearch,ray]")
import logging
logger = logging.getLogger(__name__)
logger.addHandler(logging.FileHandler('test/tune_distilbert.log'))
logger.setLevel(logging.INFO)
import flaml
MODEL_CHECKPOINT = "distilbert-base-uncased"
TASK = "cola"
NUM_LABELS = 2
COLUMN_NAME = "sentence"
METRIC_NAME = "matthews_correlation"
# HP_METRIC, MODE = "loss", "min"
HP_METRIC, MODE = "matthews_correlation", "max"
def train_distilbert(config: dict):
# Define tokenize method
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True)
def tokenize(examples):
return tokenizer(examples[COLUMN_NAME], truncation=True)
# Load CoLA dataset and apply tokenizer
cola_raw = load_dataset("glue", TASK)
cola_encoded = cola_raw.map(tokenize, batched=True)
train_dataset, eval_dataset = cola_encoded["train"], cola_encoded["validation"]
model = AutoModelForSequenceClassification.from_pretrained(
MODEL_CHECKPOINT, num_labels=NUM_LABELS
)
metric = load_metric("glue", TASK)
def compute_metrics(eval_pred):
predictions, labels = eval_pred
predictions = np.argmax(predictions, axis=1)
return metric.compute(predictions=predictions, references=labels)
training_args = TrainingArguments(
output_dir='.',
do_eval=False,
disable_tqdm=True,
logging_steps=20000,
save_total_limit=0,
**config,
)
trainer = Trainer(
model,
training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
tokenizer=tokenizer,
compute_metrics=compute_metrics,
)
# train model
trainer.train()
# evaluate model
eval_output = trainer.evaluate()
flaml.tune.report(
loss=eval_output["eval_loss"],
matthews_correlation=eval_output["eval_matthews_correlation"],
)
def _test_distillbert(method='BlendSearch'):
max_num_epoch = 64
num_samples = -1
time_budget_s = 10800
search_space = {
# You can mix constants with search space objects.
"num_train_epochs": flaml.tune.loguniform(1, max_num_epoch),
"learning_rate": flaml.tune.loguniform(1e-6, 1e-4),
"adam_beta1": flaml.tune.uniform(0.8, 0.99),
"adam_beta2": flaml.tune.loguniform(98e-2, 9999e-4),
"adam_epsilon": flaml.tune.loguniform(1e-9, 1e-7),
}
start_time = time.time()
ray.init(num_cpus=4, num_gpus=4)
if 'ASHA' == method:
algo = None
elif 'BOHB' == method:
from ray.tune.schedulers import HyperBandForBOHB
from ray.tune.suggest.bohb import tuneBOHB
algo = tuneBOHB(max_concurrent=4)
scheduler = HyperBandForBOHB(max_t=max_num_epoch)
elif 'Optuna' == method:
from ray.tune.suggest.optuna import OptunaSearch
algo = OptunaSearch()
elif 'CFO' == method:
from flaml import CFO
algo = CFO(points_to_evaluate=[{
"num_train_epochs": 1,
}])
elif 'BlendSearch' == method:
from flaml import BlendSearch
algo = BlendSearch(points_to_evaluate=[{
"num_train_epochs": 1,
}])
elif 'Dragonfly' == method:
from ray.tune.suggest.dragonfly import DragonflySearch
algo = DragonflySearch()
elif 'SkOpt' == method:
from ray.tune.suggest.skopt import SkOptSearch
algo = SkOptSearch()
elif 'Nevergrad' == method:
from ray.tune.suggest.nevergrad import NevergradSearch
import nevergrad as ng
algo = NevergradSearch(optimizer=ng.optimizers.OnePlusOne)
elif 'ZOOpt' == method:
from ray.tune.suggest.zoopt import ZOOptSearch
algo = ZOOptSearch(budget=num_samples)
elif 'Ax' == method:
from ray.tune.suggest.ax import AxSearch
algo = AxSearch()
elif 'HyperOpt' == method:
from ray.tune.suggest.hyperopt import HyperOptSearch
algo = HyperOptSearch()
scheduler = None
if method != 'BOHB':
from ray.tune.schedulers import ASHAScheduler
scheduler = ASHAScheduler(
max_t=max_num_epoch,
grace_period=1)
scheduler = None
analysis = ray.tune.run(
train_distilbert,
metric=HP_METRIC,
mode=MODE,
# You can add "gpu": 1 to allocate GPUs
resources_per_trial={"gpu": 1},
config=search_space, local_dir='test/logs/',
num_samples=num_samples, time_budget_s=time_budget_s,
keep_checkpoints_num=1, checkpoint_score_attr=HP_METRIC,
scheduler=scheduler, search_alg=algo)
ray.shutdown()
best_trial = analysis.get_best_trial(HP_METRIC, MODE, "all")
metric = best_trial.metric_analysis[HP_METRIC][MODE]
logger.info(f"method={method}")
logger.info(f"n_trials={len(analysis.trials)}")
logger.info(f"time={time.time()-start_time}")
logger.info(f"Best model eval {HP_METRIC}: {metric:.4f}")
logger.info(f"Best model parameters: {best_trial.config}")
def _test_distillbert_cfo():
_test_distillbert('CFO')
def _test_distillbert_dragonfly():
_test_distillbert('Dragonfly')
def _test_distillbert_skopt():
_test_distillbert('SkOpt')
def _test_distillbert_nevergrad():
_test_distillbert('Nevergrad')
def _test_distillbert_zoopt():
_test_distillbert('ZOOpt')
def _test_distillbert_ax():
_test_distillbert('Ax')
def __test_distillbert_hyperopt():
_test_distillbert('HyperOpt')
def _test_distillbert_optuna():
_test_distillbert('Optuna')
def _test_distillbert_asha():
_test_distillbert('ASHA')
def _test_distillbert_bohb():
_test_distillbert('BOHB')
if __name__ == "__main__":
_test_distillbert()


@@ -1,4 +1,4 @@
'''Require: pip install torchvision ray
'''Require: pip install torchvision ray flaml[blendsearch]
'''
import unittest
import os
@@ -26,7 +26,6 @@ def load_data(data_dir="./data"):
# __load_data_end__
import numpy as np
try:
import torch
import torch.nn as nn


@@ -8,11 +8,7 @@ from flaml.model import XGBoostSklearnEstimator
from flaml import tune
# dataset = "blood-transfusion-service-center"
# dataset = "Australian"
dataset = "credit-g"
# dataset = "phoneme"
# dataset = "kc1"
class XGBoost2D(XGBoostSklearnEstimator):
@@ -50,8 +46,11 @@ def test_simple(method=None):
"log_type": "all",
"time_budget": 3#6000,
}
X, y = fetch_openml(name=dataset, return_X_y=True)
try:
X, y = fetch_openml(name=dataset, return_X_y=True)
except:
from sklearn.datasets import load_wine
X, y = load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
random_state=42)
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)