This commit is contained in:
Chi Wang 2023-09-16 10:57:57 +00:00
parent 4f8e30786c
commit bc4473fe8a
318 changed files with 56 additions and 70662 deletions

View File

@ -1,5 +1,5 @@
[run]
branch = True
source = flaml
source = autogen
omit =
*test*

View File

@ -3,7 +3,7 @@
# Licensed under the MIT License. See LICENSE file in the project root for license information.
#-------------------------------------------------------------------------------------------------------------
FROM mcr.microsoft.com/vscode/devcontainers/python:0-3.9
FROM mcr.microsoft.com/vscode/devcontainers/python:0-3.10
#
# Update the OS and maybe install packages
@ -17,7 +17,6 @@ RUN apt-get update \
&& rm -rf /var/lib/apt/lists/*
ENV DEBIAN_FRONTEND=dialog
# RUN pip3 --disable-pip-version-check --no-cache-dir install flaml
# For docs
RUN npm install --global yarn
RUN pip install pydoc-markdown==4.5.0
RUN pip install pydoc-markdown

View File

@ -1,4 +1,4 @@
<!-- Thank you for your contribution! Please review https://microsoft.github.io/FLAML/docs/Contribute before opening a pull request. -->
<!-- Thank you for your contribution! Please review https://microsoft.github.io/autogen/docs/Contribute before opening a pull request. -->
<!-- Please add a reviewer to the assignee section when you create a PR. If you don't have the access to it, we will shortly find a reviewer and assign them to your PR. -->
@ -12,7 +12,6 @@
## Checks
<!-- - I've used [pre-commit](https://microsoft.github.io/FLAML/docs/Contribute#pre-commit) to lint the changes in this PR (note the same in integrated in our CI checks). -->
- [ ] I've included any doc changes needed for https://microsoft.github.io/FLAML/. See https://microsoft.github.io/FLAML/docs/Contribute#documentation to build and test documentation locally.
- [ ] I've included any doc changes needed for https://microsoft.github.io/autogen/. See https://microsoft.github.io/autogen/docs/Contribute#documentation to build and test documentation locally.
- [ ] I've added tests (if relevant) corresponding to the changes introduced in this PR.
- [ ] I've made sure all auto checks have passed.

View File

@ -7,7 +7,7 @@ on:
push:
branches: ['main']
paths:
- 'flaml/**'
- 'autogen/**'
- 'test/**'
- 'notebook/**'
- '.github/workflows/python-package.yml'

View File

@ -1,18 +1,36 @@
preferred-citation:
type: inproceedings
authors:
- family-names: "Wang"
given-names: "Chi"
affiliation: "Microsoft Research, Redmond WA USA"
- family-names: "Wu"
given-names: "Qingyun"
affiliation: "Penn State University, University Park PA USA"
- family-names: "Bansal"
given-names: "Gargan"
affiliation: "Microsoft Research, Redmond WA USA"
- family-names: "Weimer"
given-names: "Markus"
affiliation: "Microsoft Corporation, Redmond WA USA"
- family-names: "Zhang"
given-names: "Jieyu"
affiliation: "University of Washington, Seattle WA USA"
- family-names: "Wu"
given-names: "Yiran"
affiliation: "Penn State University, University Park PA USA"
- family-names: "Zhang"
given-names: "Shaokun"
affiliation: "Penn State University, University Park PA USA"
- family-names: "Zhu"
given-names: "Eric"
affiliation: "Microsoft Research, Redmond WA USA"
booktitle: "Proceedings of the 4th MLSys Conference"
title: "FLAML: A Fast and Lightweight AutoML Library"
year: 2021
- family-names: "Li"
given-names: "Beibin"
affiliation: "Microsoft Research, Redmond WA USA"
- family-names: "Jiang"
given-names: "Li"
affiliation: "Microsoft Corporation"
- family-names: "Zhang"
given-names: "Xiaoyun"
affiliation: "Microsoft Corporation, Redmond WA USA"
- family-names: "Wang"
given-names: "Chi"
affiliation: "Microsoft Research, Redmond WA USA"
booktitle: "ArXiv preprint arXiv:2308.08155"
title: "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework"
year: 2023

View File

@ -1,30 +1,20 @@
# basic setup
FROM python:3.7
FROM python:3.10
RUN apt-get update && apt-get -y update
RUN apt-get install -y sudo git npm
# Install Spark
RUN sudo apt-get update && sudo apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
ca-certificates-java ca-certificates openjdk-17-jdk-headless \
wget \
&& sudo apt-get clean && sudo rm -rf /var/lib/apt/lists/*
RUN wget --progress=dot:giga "https://www.apache.org/dyn/closer.lua/spark/spark-3.3.0/spark-3.3.0-bin-hadoop2.tgz?action=download" -O - | tar -xzC /tmp; archive=$(basename "spark-3.3.0/spark-3.3.0-bin-hadoop2.tgz") bash -c "sudo mv -v /tmp/\${archive/%.tgz/} /spark"
ENV SPARK_HOME=/spark \
PYTHONPATH=/spark/python/lib/py4j-0.10.9.5-src.zip:/spark/python
ENV PATH="${PATH}:${SPARK_HOME}/bin"
# Setup user to not run as root
RUN adduser --disabled-password --gecos '' flaml-dev
RUN adduser flaml-dev sudo
RUN adduser --disabled-password --gecos '' autogen-dev
RUN adduser autogen-dev sudo
RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
USER flaml-dev
USER autogen-dev
# Pull repo
RUN cd /home/flaml-dev && git clone https://github.com/microsoft/FLAML.git
WORKDIR /home/flaml-dev/FLAML
RUN cd /home/autogen-dev && git clone https://github.com/microsoft/autogen.git
WORKDIR /home/autogen-dev/autogen
# Install FLAML (Note: extra components can be installed if needed)
RUN sudo pip install -e .[test,notebook]
# Install autogen (Note: extra components can be installed if needed)
RUN sudo pip install -e .[test]
# Install precommit hooks
RUN pre-commit install

290
NOTICE.md
View File

@ -1,290 +0,0 @@
NOTICES
This repository incorporates material as listed below or described in the code.
#
## Component. Ray.
Code in tune/[analysis.py, sample.py, trial.py, result.py],
searcher/[suggestion.py, variant_generator.py], and scheduler/trial_scheduler.py is adapted from
https://github.com/ray-project/ray/blob/master/python/ray/tune/
## Open Source License/Copyright Notice.
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright {yyyy} {name of copyright owner}
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--------------------------------------------------------------------------------
Code in python/ray/rllib/{evolution_strategies, dqn} adapted from
https://github.com/openai (MIT License)
Copyright (c) 2016 OpenAI (http://openai.com)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
--------------------------------------------------------------------------------
Code in python/ray/rllib/impala/vtrace.py from
https://github.com/deepmind/scalable_agent
Copyright 2018 Google LLC
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--------------------------------------------------------------------------------
Code in python/ray/rllib/ars is adapted from https://github.com/modestyachts/ARS
Copyright (c) 2018, ARS contributors (Horia Mania, Aurelia Guy, Benjamin Recht)
All rights reserved.
Redistribution and use of ARS in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
------------------
Code in python/ray/_private/prometheus_exporter.py is adapted from https://github.com/census-instrumentation/opencensus-python/blob/master/contrib/opencensus-ext-prometheus/opencensus/ext/prometheus/stats_exporter/__init__.py

View File

@ -12,6 +12,7 @@ the rights to use your contribution. For details, visit https://cla.opensource.m
[![](https://img.shields.io/discord/1025786666260111483?logo=discord&style=flat)](https://discord.gg/Cppx2vSPVP)
<!-- [![Join the chat at https://gitter.im/FLAMLer/community](https://badges.gitter.im/FLAMLer/community.svg)](https://gitter.im/FLAMLer/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) -->
This project is a spinoff from [FLAML](https://github.com/microsoft/FLAML).
# AutoGen

View File

@ -1,5 +0,0 @@
from flaml.automl.automl import AutoML, size
from flaml.automl.logger import logger_formatter
from flaml.automl.state import SearchState, AutoMLState
__all__ = ["AutoML", "AutoMLState", "SearchState", "logger_formatter", "size"]

File diff suppressed because it is too large Load Diff

View File

@ -1,443 +0,0 @@
# !
# * Copyright (c) Microsoft Corporation. All rights reserved.
# * Licensed under the MIT License. See LICENSE file in the
# * project root for license information.
import numpy as np
from datetime import datetime
from typing import TYPE_CHECKING, Union
import os
from flaml.automl.training_log import training_log_reader
from flaml.automl.spark import ps, psDataFrame, psSeries, DataFrame, Series, pd
try:
from scipy.sparse import vstack, issparse
except ImportError:
pass
if TYPE_CHECKING:
from flaml.automl.task import Task
TS_TIMESTAMP_COL = "ds"
TS_VALUE_COL = "y"
def load_openml_dataset(dataset_id, data_dir=None, random_state=0, dataset_format="dataframe"):
"""Load dataset from open ML.
If the file is not cached locally, download it from open ML.
Args:
dataset_id: An integer of the dataset id in openml.
data_dir: A string of the path to store and load the data.
random_state: An integer of the random seed for splitting data.
dataset_format: A string specifying the format of returned dataset. Default is 'dataframe'.
Can choose from ['dataframe', 'array'].
If 'dataframe', the returned dataset will be a Pandas DataFrame.
If 'array', the returned dataset will be a NumPy array or a SciPy sparse matrix.
Returns:
X_train: Training data.
X_test: Test data.
y_train: A series or array of labels for training data.
y_test: A series or array of labels for test data.
"""
import openml
import pickle
from sklearn.model_selection import train_test_split
filename = "openml_ds" + str(dataset_id) + ".pkl"
filepath = os.path.join(data_dir, filename)
if os.path.isfile(filepath):
print("load dataset from", filepath)
with open(filepath, "rb") as f:
dataset = pickle.load(f)
else:
print("download dataset from openml")
dataset = openml.datasets.get_dataset(dataset_id)
if not os.path.exists(data_dir):
os.makedirs(data_dir)
with open(filepath, "wb") as f:
pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
print("Dataset name:", dataset.name)
try:
X, y, *__ = dataset.get_data(target=dataset.default_target_attribute, dataset_format=dataset_format)
except ValueError:
from sklearn.datasets import fetch_openml
X, y = fetch_openml(data_id=dataset_id, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
print(
"X_train.shape: {}, y_train.shape: {};\nX_test.shape: {}, y_test.shape: {}".format(
X_train.shape,
y_train.shape,
X_test.shape,
y_test.shape,
)
)
return X_train, X_test, y_train, y_test
def load_openml_task(task_id, data_dir):
"""Load task from open ML.
Use the first fold of the task.
If the file is not cached locally, download it from open ML.
Args:
task_id: An integer of the task id in openml.
data_dir: A string of the path to store and load the data.
Returns:
X_train: A dataframe of training data.
X_test: A dataframe of test data.
y_train: A series of labels for training data.
y_test: A series of labels for test data.
"""
import openml
import pickle
task = openml.tasks.get_task(task_id)
filename = "openml_task" + str(task_id) + ".pkl"
filepath = os.path.join(data_dir, filename)
if os.path.isfile(filepath):
print("load dataset from", filepath)
with open(filepath, "rb") as f:
dataset = pickle.load(f)
else:
print("download dataset from openml")
dataset = task.get_dataset()
with open(filepath, "wb") as f:
pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
X, y, _, _ = dataset.get_data(task.target_name)
train_indices, test_indices = task.get_train_test_split_indices(
repeat=0,
fold=0,
sample=0,
)
X_train = X.iloc[train_indices]
y_train = y[train_indices]
X_test = X.iloc[test_indices]
y_test = y[test_indices]
print(
"X_train.shape: {}, y_train.shape: {},\nX_test.shape: {}, y_test.shape: {}".format(
X_train.shape,
y_train.shape,
X_test.shape,
y_test.shape,
)
)
return X_train, X_test, y_train, y_test
def get_output_from_log(filename, time_budget):
"""Get output from log file.
Args:
filename: A string of the log file name.
time_budget: A float of the time budget in seconds.
Returns:
search_time_list: A list of the finished time of each logged iter.
best_error_list: A list of the best validation error after each logged iter.
error_list: A list of the validation error of each logged iter.
config_list: A list of the estimator, sample size and config of each logged iter.
logged_metric_list: A list of the logged metric of each logged iter.
"""
best_config = None
best_learner = None
best_val_loss = float("+inf")
search_time_list = []
config_list = []
best_error_list = []
error_list = []
logged_metric_list = []
best_config_list = []
with training_log_reader(filename) as reader:
for record in reader.records():
time_used = record.wall_clock_time
val_loss = record.validation_loss
config = record.config
learner = record.learner.split("_")[0]
sample_size = record.sample_size
metric = record.logged_metric
if time_used < time_budget and np.isfinite(val_loss):
if val_loss < best_val_loss:
best_val_loss = val_loss
best_config = config
best_learner = learner
best_config_list.append(best_config)
search_time_list.append(time_used)
best_error_list.append(best_val_loss)
logged_metric_list.append(metric)
error_list.append(val_loss)
config_list.append(
{
"Current Learner": learner,
"Current Sample": sample_size,
"Current Hyper-parameters": record.config,
"Best Learner": best_learner,
"Best Hyper-parameters": best_config,
}
)
return (
search_time_list,
best_error_list,
error_list,
config_list,
logged_metric_list,
)
def concat(X1, X2):
"""concatenate two matrices vertically."""
if type(X1) != type(X2):
if isinstance(X2, (psDataFrame, psSeries)):
X1 = ps.from_pandas(pd.DataFrame(X1))
elif isinstance(X1, (psDataFrame, psSeries)):
X2 = ps.from_pandas(pd.DataFrame(X2))
else:
X1 = pd.DataFrame(X1)
X2 = pd.DataFrame(X2)
if isinstance(X1, (DataFrame, Series)):
df = pd.concat([X1, X2], sort=False)
df.reset_index(drop=True, inplace=True)
if isinstance(X1, DataFrame):
cat_columns = X1.select_dtypes(include="category").columns
if len(cat_columns):
df[cat_columns] = df[cat_columns].astype("category")
return df
if isinstance(X1, (psDataFrame, psSeries)):
df = ps.concat([X1, X2], ignore_index=True)
if isinstance(X1, psDataFrame):
cat_columns = X1.select_dtypes(include="category").columns.values.tolist()
if len(cat_columns):
df[cat_columns] = df[cat_columns].astype("category")
return df
if issparse(X1):
return vstack((X1, X2))
else:
return np.concatenate([X1, X2])
def add_time_idx_col(X):
unique_dates = X[TS_TIMESTAMP_COL].drop_duplicates().sort_values(ascending=True)
# assume no missing timestamps
freq = pd.infer_freq(unique_dates)
if freq == "MS":
X["time_idx"] = X[TS_TIMESTAMP_COL].dt.year * 12 + X[TS_TIMESTAMP_COL].dt.month
elif freq == "Y":
X["time_idx"] = X[TS_TIMESTAMP_COL].dt.year
else:
# using time frequency to generate all time stamps and then indexing for time_idx
# full_range = pd.date_range(X[TS_TIMESTAMP_COL].min(), X[TS_TIMESTAMP_COL].max(), freq=freq).to_list()
# X["time_idx"] = [full_range.index(time) for time in X[TS_TIMESTAMP_COL]]
# taking minimum difference in timestamp
timestamps = unique_dates.view("int64")
freq = int(timestamps.diff().mode())
X["time_idx"] = timestamps - timestamps.min() / freq
X["time_idx"] = X["time_idx"].astype("int")
return X
class DataTransformer:
"""Transform input training data."""
def fit_transform(self, X: Union[DataFrame, np.ndarray], y, task: Union[str, "Task"]):
"""Fit transformer and process the input training data according to the task type.
Args:
X: A numpy array or a pandas dataframe of training data.
y: A numpy array or a pandas series of labels.
task: An instance of type Task, or a str such as 'classification', 'regression'.
Returns:
X: Processed numpy array or pandas dataframe of training data.
y: Processed numpy array or pandas series of labels.
"""
if isinstance(task, str):
from flaml.automl.task.factory import task_factory
task = task_factory(task, X, y)
if task.is_nlp():
# if the mode is NLP, check the type of input, each column must be either string or
# ids (input ids, token type id, attention mask, etc.)
str_columns = []
for column in X.columns:
if isinstance(X[column].iloc[0], str):
str_columns.append(column)
if len(str_columns) > 0:
X[str_columns] = X[str_columns].astype("string")
self._str_columns = str_columns
elif isinstance(X, DataFrame):
X = X.copy()
n = X.shape[0]
cat_columns, num_columns, datetime_columns = [], [], []
drop = False
if task.is_ts_forecast():
X = X.rename(columns={X.columns[0]: TS_TIMESTAMP_COL})
if task.is_ts_forecastpanel():
if "time_idx" not in X:
X = add_time_idx_col(X)
ds_col = X.pop(TS_TIMESTAMP_COL)
if isinstance(y, Series):
y = y.rename(TS_VALUE_COL)
for column in X.columns:
# sklearn\utils\validation.py needs int/float values
if X[column].dtype.name in ("object", "category"):
if X[column].nunique() == 1 or X[column].nunique(dropna=True) == n - X[column].isnull().sum():
X.drop(columns=column, inplace=True)
drop = True
elif X[column].dtype.name == "category":
current_categories = X[column].cat.categories
if "__NAN__" not in current_categories:
X[column] = X[column].cat.add_categories("__NAN__").fillna("__NAN__")
cat_columns.append(column)
else:
X[column] = X[column].fillna("__NAN__")
cat_columns.append(column)
elif X[column].nunique(dropna=True) < 2:
X.drop(columns=column, inplace=True)
drop = True
else: # datetime or numeric
if X[column].dtype.name == "datetime64[ns]":
tmp_dt = X[column].dt
new_columns_dict = {
f"year_{column}": tmp_dt.year,
f"month_{column}": tmp_dt.month,
f"day_{column}": tmp_dt.day,
f"hour_{column}": tmp_dt.hour,
f"minute_{column}": tmp_dt.minute,
f"second_{column}": tmp_dt.second,
f"dayofweek_{column}": tmp_dt.dayofweek,
f"dayofyear_{column}": tmp_dt.dayofyear,
f"quarter_{column}": tmp_dt.quarter,
}
for key, value in new_columns_dict.items():
if key not in X.columns and value.nunique(dropna=False) >= 2:
X[key] = value
num_columns.append(key)
X[column] = X[column].map(datetime.toordinal)
datetime_columns.append(column)
del tmp_dt
X[column] = X[column].fillna(np.nan)
num_columns.append(column)
X = X[cat_columns + num_columns]
if task.is_ts_forecast():
X.insert(0, TS_TIMESTAMP_COL, ds_col)
if cat_columns:
X[cat_columns] = X[cat_columns].astype("category")
if num_columns:
X_num = X[num_columns]
if np.issubdtype(X_num.columns.dtype, np.integer) and (
drop or min(X_num.columns) != 0 or max(X_num.columns) != X_num.shape[1] - 1
):
X_num.columns = range(X_num.shape[1])
drop = True
else:
drop = False
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
self.transformer = ColumnTransformer(
[
(
"continuous",
SimpleImputer(missing_values=np.nan, strategy="median"),
X_num.columns,
)
]
)
X[num_columns] = self.transformer.fit_transform(X_num)
self._cat_columns, self._num_columns, self._datetime_columns = (
cat_columns,
num_columns,
datetime_columns,
)
self._drop = drop
if task.is_classification() or not pd.api.types.is_numeric_dtype(y) and not task.is_nlg():
if not task.is_token_classification():
from sklearn.preprocessing import LabelEncoder
self.label_transformer = LabelEncoder()
else:
from flaml.automl.nlp.utils import LabelEncoderforTokenClassification
self.label_transformer = LabelEncoderforTokenClassification()
y = self.label_transformer.fit_transform(y)
else:
self.label_transformer = None
self._task = task
return X, y
def transform(self, X: Union[DataFrame, np.ndarray]):
    """Process data using fit transformer.

    Replays the feature engineering learned during fit: datetime feature
    expansion, categorical NaN sentinel filling, column selection/ordering,
    and numeric imputation via the fitted ColumnTransformer.

    Args:
        X: A numpy array or a pandas dataframe of training data.

    Returns:
        X: Processed numpy array or pandas dataframe of training data.
    """
    X = X.copy()
    if self._task.is_nlp():
        # if the mode is NLP, check the type of input, each column must be either string or
        # ids (input ids, token type id, attention mask, etc.)
        if len(self._str_columns) > 0:
            X[self._str_columns] = X[self._str_columns].astype("string")
    elif isinstance(X, DataFrame):
        # Reuse the column partition discovered during fit.
        cat_columns, num_columns, datetime_columns = (
            self._cat_columns,
            self._num_columns,
            self._datetime_columns,
        )
        if self._task.is_ts_forecast():
            # The first column is the timestamp; temporarily remove it so the
            # feature columns can be selected/ordered, then re-insert it below.
            X = X.rename(columns={X.columns[0]: TS_TIMESTAMP_COL})
            ds_col = X.pop(TS_TIMESTAMP_COL)
        for column in datetime_columns:
            tmp_dt = X[column].dt
            new_columns_dict = {
                f"year_{column}": tmp_dt.year,
                f"month_{column}": tmp_dt.month,
                f"day_{column}": tmp_dt.day,
                f"hour_{column}": tmp_dt.hour,
                f"minute_{column}": tmp_dt.minute,
                f"second_{column}": tmp_dt.second,
                f"dayofweek_{column}": tmp_dt.dayofweek,
                f"dayofyear_{column}": tmp_dt.dayofyear,
                f"quarter_{column}": tmp_dt.quarter,
            }
            # Only materialize the derived columns that were kept at fit time
            # (membership in num_columns), so train/inference schemas match.
            for new_col_name, new_col_value in new_columns_dict.items():
                if new_col_name not in X.columns and new_col_name in num_columns:
                    X[new_col_name] = new_col_value
            # Replace the raw datetime with its ordinal day number, as in fit.
            X[column] = X[column].map(datetime.toordinal)
            del tmp_dt
        # Keep only (and order) the columns selected during fit.
        X = X[cat_columns + num_columns].copy()
        if self._task.is_ts_forecast():
            X.insert(0, TS_TIMESTAMP_COL, ds_col)
        for column in cat_columns:
            if X[column].dtype.name == "object":
                # Same NaN sentinel used during fit.
                X[column] = X[column].fillna("__NAN__")
            elif X[column].dtype.name == "category":
                current_categories = X[column].cat.categories
                if "__NAN__" not in current_categories:
                    X[column] = X[column].cat.add_categories("__NAN__").fillna("__NAN__")
        if cat_columns:
            X[cat_columns] = X[cat_columns].astype("category")
        if num_columns:
            X_num = X[num_columns].fillna(np.nan)
            if self._drop:
                # Fit renamed numeric columns to 0..n-1 before imputation;
                # mirror that here so the ColumnTransformer sees the same names.
                X_num.columns = range(X_num.shape[1])
            X[num_columns] = self.transformer.transform(X_num)
    return X
def group_counts(groups):
    """Return the size of each group in `groups`, ordered by first appearance.

    Args:
        groups: A 1d array-like of group labels.

    Returns:
        A 1d numpy array with one count per distinct group, in the order the
        groups first occur in `groups`.
    """
    # np.unique returns (unique_values, first_indices, counts); sorting the
    # counts by each group's first index restores appearance order.
    uniques, first_pos, sizes = np.unique(groups, return_index=True, return_counts=True)
    appearance_order = np.argsort(first_pos)
    return sizes[appearance_order]

View File

@ -1,7 +0,0 @@
import logging

# Module-level logger shared across the flaml.automl subpackage.
logger = logging.getLogger(__name__)
# Record format: "[logger name: timestamp] {line number} LEVEL - message".
logger_formatter = logging.Formatter(
    "[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s", "%m-%d %H:%M:%S"
)
# Do not bubble records to the root logger; handlers are attached explicitly elsewhere.
logger.propagate = False

View File

@ -1,606 +0,0 @@
# !
# * Copyright (c) FLAML authors. All rights reserved.
# * Licensed under the MIT License. See LICENSE file in the
# * project root for license information.
import time
from typing import Union, Callable, TypeVar, Optional, Tuple
import logging
import numpy as np
from flaml.automl.data import group_counts
from flaml.automl.task.task import Task
from flaml.automl.model import BaseEstimator, TransformersEstimator
from flaml.automl.spark import psDataFrame, psSeries, ERROR as SPARK_ERROR, Series, DataFrame
try:
from sklearn.metrics import (
mean_squared_error,
r2_score,
roc_auc_score,
accuracy_score,
mean_absolute_error,
log_loss,
average_precision_score,
f1_score,
mean_absolute_percentage_error,
ndcg_score,
)
except ImportError:
pass
if SPARK_ERROR is None:
from flaml.automl.spark.metrics import spark_metric_loss_score
from flaml.automl.time_series import TimeSeriesDataset
logger = logging.getLogger(__name__)
EstimatorSubclass = TypeVar("EstimatorSubclass", bound=BaseEstimator)
# Metric names routed to sklearn_metric_loss_score by metric_loss_score.
# Note: any "ndcg@k" variant is also accepted via a startswith("ndcg") check,
# not via membership in this set.
sklearn_metric_name_set = {
    "r2",
    "rmse",
    "mae",
    "mse",
    "accuracy",
    "roc_auc",
    "roc_auc_ovr",
    "roc_auc_ovo",
    "roc_auc_weighted",
    "roc_auc_ovr_weighted",
    "roc_auc_ovo_weighted",
    "log_loss",
    "mape",
    "f1",
    "ap",
    "ndcg",
    "micro_f1",
    "macro_f1",
}
# Optimization direction for each supported huggingface `datasets` metric:
# "max" means higher is better (the loss is reported as 1 - score),
# "min" means lower is better (the score is used as the loss directly).
huggingface_metric_to_mode = {
    "accuracy": "max",
    "bertscore": "max",
    "bleu": "max",
    "bleurt": "max",
    "cer": "min",
    "chrf": "min",
    "code_eval": "max",
    "comet": "max",
    "competition_math": "max",
    "coval": "max",
    "cuad": "max",
    "f1": "max",
    "gleu": "max",
    "google_bleu": "max",
    "matthews_correlation": "max",
    "meteor": "max",
    "pearsonr": "max",
    "precision": "max",
    "recall": "max",
    "rouge": "max",
    "sacrebleu": "max",
    "sari": "max",
    "seqeval": "max",
    "spearmanr": "max",
    "ter": "min",
    "wer": "min",
}
# Sub-metrics that must be loaded through their parent huggingface metric.
huggingface_submetric_to_metric = {"rouge1": "rouge", "rouge2": "rouge"}
def metric_loss_score(
    metric_name: str,
    y_processed_predict,
    y_processed_true,
    labels=None,
    sample_weight=None,
    groups=None,
):
    """Compute a loss (lower is better) for predictions under the named metric.

    Dispatch order: spark metrics for pyspark.pandas inputs, sklearn metrics
    for built-in names, and huggingface ``datasets`` metrics otherwise.
    Huggingface metrics whose mode is "max" are converted to a loss as
    1 - score; "min" metrics are returned as-is.

    Args:
        metric_name: Metric name, e.g. "accuracy", "rouge1", or "seqeval:f1".
        y_processed_predict: Predictions (processed id labels if the originals
            were token labels).
        y_processed_true: Ground truth in the same processed form.
        labels: Unique label list; used to map ids back to tokens for seqeval.
        sample_weight: Optional per-sample weights.
        groups: Optional group labels (used by ranking metrics such as ndcg@k).

    Returns:
        A float loss, the lower the better.

    Raises:
        ValueError: If the metric is unknown or the required extras are missing.
    """
    if isinstance(y_processed_predict, (psDataFrame, psSeries)):
        return spark_metric_loss_score(
            metric_name,
            y_processed_predict,
            y_processed_true,
            sample_weight,
            groups,
        )
    elif is_in_sklearn_metric_name_set(metric_name):
        return sklearn_metric_loss_score(
            metric_name,
            y_processed_predict,
            y_processed_true,
            labels,
            sample_weight,
            groups,
        )
    else:
        try:
            import datasets

            datasets_metric_name = huggingface_submetric_to_metric.get(metric_name, metric_name.split(":")[0])
            metric = datasets.load_metric(datasets_metric_name)
            metric_mode = huggingface_metric_to_mode[datasets_metric_name]

            if metric_name.startswith("seqeval"):
                # seqeval expects token labels, not ids.
                y_processed_true = [[labels[tr] for tr in each_list] for each_list in y_processed_true]
            elif datasets_metric_name in ("pearsonr", "spearmanr"):
                # Bug fix: the original compared the loaded Metric *object*
                # (`metric`) to these strings, so this branch never ran.
                y_processed_true = (
                    y_processed_true.to_list() if isinstance(y_processed_true, Series) else list(y_processed_true)
                )
            score_dict = metric.compute(predictions=y_processed_predict, references=y_processed_true)
            if "rouge" in metric_name:
                score = score_dict[metric_name].mid.fmeasure
            elif metric_name.startswith("seqeval"):
                # "seqeval:<submetric>" selects a submetric; bare "seqeval"
                # falls back to overall accuracy.
                metric_submetric_names = metric_name.split(":")
                score = score_dict[metric_submetric_names[1] if len(metric_submetric_names) > 1 else "overall_accuracy"]
            else:
                score = score_dict[metric_name]
        except ImportError:
            # Bug fix: a stray comma previously split this message into two
            # ValueError arguments; it is now a single string.
            raise ValueError(
                metric_name + " is not an built-in sklearn metric and [hf] is not installed. "
                "Currently built-in sklearn metrics are: "
                "r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo,"
                "log_loss, mape, f1, micro_f1, macro_f1, ap. "
                "If the metric is a huggingface metric, please pip install flaml[hf] "
                "or pass a customized metric function to AutoML.fit(metric=func)"
            )
        # If the metric is not found from huggingface dataset metric list (i.e., FileNotFoundError)
        # ask the user to provide a custom metric
        except FileNotFoundError:
            raise ValueError(
                metric_name + " is neither an sklearn metric nor a huggingface metric. "
                "Currently built-in sklearn metrics are: "
                "r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo,"
                "log_loss, mape, f1, micro_f1, macro_f1, ap. "
                "Currently built-in huggingface metrics are: "
                + ", ".join(huggingface_metric_to_mode.keys())
                + ". Please pass a customized metric function to AutoML.fit(metric=func)"
            )
        if metric_mode == "max":
            return 1 - score
        else:
            return score
def is_in_sklearn_metric_name_set(metric_name: str):
    """Return True if `metric_name` is handled by sklearn_metric_loss_score.

    Any name beginning with "ndcg" (e.g. "ndcg@5") is accepted, as well as the
    exact names in `sklearn_metric_name_set`.
    """
    if metric_name.startswith("ndcg"):
        return True
    return metric_name in sklearn_metric_name_set
def is_min_metric(metric_name: str):
    """Return True if lower raw values of `metric_name` are better.

    Covers the built-in error metrics plus any huggingface metric whose
    registered mode is "min".
    """
    if metric_name in ["rmse", "mae", "mse", "log_loss", "mape"]:
        return True
    return huggingface_metric_to_mode.get(metric_name, None) == "min"
def sklearn_metric_loss_score(
    metric_name: str,
    y_predict,
    y_true,
    labels=None,
    sample_weight=None,
    groups=None,
):
    """Loss using the specified metric.

    Metrics where higher is better (r2, accuracy, roc_auc*, f1*, ap, ndcg*)
    are converted to a loss as 1 - score; error metrics (rmse, mae, mse,
    log_loss, mape) are returned directly.

    Args:
        metric_name: A string of the metric name, one of
            'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr',
            'roc_auc_ovo', 'roc_auc_weighted', 'roc_auc_ovo_weighted', 'roc_auc_ovr_weighted',
            'log_loss', 'mape', 'f1', 'ap', 'ndcg', 'micro_f1', 'macro_f1'.
            'ndcg@k' (e.g. 'ndcg@5') computes grouped NDCG at rank k.
        y_predict: A 1d or 2d numpy array of the predictions which can be
            used to calculate the metric. E.g., 2d for log_loss and 1d
            for others.
        y_true: A 1d numpy array of the true labels.
        labels: A list or an array of the unique labels.
        sample_weight: A 1d numpy array of the sample weight.
        groups: A 1d numpy array of the group labels (required for 'ndcg@k').

    Returns:
        score: A float number of the loss, the lower the better.
    """
    # Normalize so matching below is case-insensitive.
    metric_name = metric_name.lower()

    if "r2" == metric_name:
        score = 1.0 - r2_score(y_true, y_predict, sample_weight=sample_weight)
    elif metric_name == "rmse":
        score = np.sqrt(mean_squared_error(y_true, y_predict, sample_weight=sample_weight))
    elif metric_name == "mae":
        score = mean_absolute_error(y_true, y_predict, sample_weight=sample_weight)
    elif metric_name == "mse":
        score = mean_squared_error(y_true, y_predict, sample_weight=sample_weight)
    elif metric_name == "accuracy":
        score = 1.0 - accuracy_score(y_true, y_predict, sample_weight=sample_weight)
    elif metric_name == "roc_auc":
        score = 1.0 - roc_auc_score(y_true, y_predict, sample_weight=sample_weight)
    elif metric_name == "roc_auc_ovr":
        score = 1.0 - roc_auc_score(y_true, y_predict, sample_weight=sample_weight, multi_class="ovr")
    elif metric_name == "roc_auc_ovo":
        score = 1.0 - roc_auc_score(y_true, y_predict, sample_weight=sample_weight, multi_class="ovo")
    elif metric_name == "roc_auc_weighted":
        score = 1.0 - roc_auc_score(y_true, y_predict, sample_weight=sample_weight, average="weighted")
    elif metric_name == "roc_auc_ovo_weighted":
        score = 1.0 - roc_auc_score(
            y_true,
            y_predict,
            sample_weight=sample_weight,
            average="weighted",
            multi_class="ovo",
        )
    elif metric_name == "roc_auc_ovr_weighted":
        score = 1.0 - roc_auc_score(
            y_true,
            y_predict,
            sample_weight=sample_weight,
            average="weighted",
            multi_class="ovr",
        )
    elif "log_loss" == metric_name:
        score = log_loss(y_true, y_predict, labels=labels, sample_weight=sample_weight)
    elif "mape" == metric_name:
        try:
            score = mean_absolute_percentage_error(y_true, y_predict)
        except ValueError:
            # sklearn raises when y_true contains zeros; report worst loss.
            return np.inf
    elif "micro_f1" == metric_name:
        score = 1 - f1_score(y_true, y_predict, sample_weight=sample_weight, average="micro")
    elif "macro_f1" == metric_name:
        score = 1 - f1_score(y_true, y_predict, sample_weight=sample_weight, average="macro")
    elif "f1" == metric_name:
        score = 1 - f1_score(y_true, y_predict, sample_weight=sample_weight)
    elif "ap" == metric_name:
        score = 1 - average_precision_score(y_true, y_predict, sample_weight=sample_weight)
    elif "ndcg" in metric_name:
        if "@" in metric_name:
            # "ndcg@k": average per-group NDCG at rank k. Groups are assumed to
            # be contiguous in y_true/y_predict; `psum` walks the boundaries
            # using the group sizes from group_counts.
            k = int(metric_name.split("@", 1)[-1])
            counts = group_counts(groups)
            score = 0
            psum = 0
            for c in counts:
                score -= ndcg_score(
                    np.asarray([y_true[psum : psum + c]]),
                    np.asarray([y_predict[psum : psum + c]]),
                    k=k,
                )
                psum += c
            # Negated sum / number of groups, plus 1 => 1 - mean(ndcg@k).
            score /= len(counts)
            score += 1
        else:
            score = 1 - ndcg_score([y_true], [y_predict])
    return score
def get_y_pred(estimator, X, eval_metric, task: Task):
    """Produce predictions from `estimator` in the form `eval_metric` expects.

    Binary probability metrics receive the positive-class probabilities,
    multi-class probability metrics receive the full probability matrix, and
    everything else receives plain class/value predictions. pandas results are
    unwrapped to numpy via `.values`.
    """
    binary_proba_metrics = ["roc_auc", "ap", "roc_auc_weighted"]
    full_proba_metrics = [
        "log_loss",
        "roc_auc",
        "roc_auc_ovr",
        "roc_auc_ovo",
        "roc_auc_ovo_weighted",
        "roc_auc_ovr_weighted",
    ]
    if eval_metric in binary_proba_metrics and task.is_binary():
        y_pred_classes = estimator.predict_proba(X)
        if isinstance(y_pred_classes, (psSeries, psDataFrame)):
            # pyspark.pandas results are passed through unchanged.
            y_pred = y_pred_classes
        elif y_pred_classes.ndim > 1:
            # Keep only the positive-class column.
            y_pred = y_pred_classes[:, 1]
        else:
            y_pred = y_pred_classes
    elif eval_metric in full_proba_metrics:
        y_pred = estimator.predict_proba(X)
    else:
        y_pred = estimator.predict(X)
    if isinstance(y_pred, (Series, DataFrame)):
        y_pred = y_pred.values
    return y_pred
def to_numpy(x):
    """Convert `x` to a numpy column vector of shape (n, 1).

    Args:
        x: A pandas Series/DataFrame or any array-like.

    Returns:
        A numpy array reshaped to (-1, 1).
    """
    # Bug fix: the original condition was `isinstance(x, Series or isinstance(x,
    # DataFrame))`, which reduces to `isinstance(x, Series)` and never matched
    # DataFrame inputs.
    if isinstance(x, (Series, DataFrame)):
        x = x.values
    else:
        # Bug fix: `np.ndarray(x)` interprets x as a *shape* and returns
        # uninitialized memory; `np.asarray` is the correct conversion.
        x = np.asarray(x)
    return x.reshape((-1, 1))
def compute_estimator(
    X_train,
    y_train,
    X_val,
    y_val,
    weight_val,
    groups_val,
    budget,
    kf,
    config_dic: dict,
    task: Union[str, Task],
    estimator_name: str,
    eval_method: str,
    eval_metric: Union[str, Callable],
    best_val_loss=np.Inf,
    n_jobs: Optional[int] = 1,  # some estimators of EstimatorSubclass don't accept n_jobs. Should be None in that case.
    estimator_class: Optional[EstimatorSubclass] = None,
    cv_score_agg_func: Optional[callable] = None,
    log_training_metric: Optional[bool] = False,
    fit_kwargs: Optional[dict] = None,
    free_mem_ratio=0,
):
    """Instantiate an estimator from `config_dic` and evaluate it.

    Uses holdout evaluation when `eval_method == "holdout"`, otherwise
    cross-validation via `task.evaluate_model_CV` with the splitter `kf`.

    Returns:
        A tuple (estimator, val_loss, metric_for_logging, train_time, pred_time).
    """
    if fit_kwargs is None:
        fit_kwargs = {}

    estimator_class = estimator_class or task.estimator_class_from_str(estimator_name)
    estimator = estimator_class(
        **config_dic,
        task=task,
        n_jobs=n_jobs,
    )

    if isinstance(estimator, TransformersEstimator):
        # TODO: move the partial function to nlp
        # Transformers estimators receive metric and validation data through
        # fit kwargs; these keys are removed again after evaluation below.
        fit_kwargs["metric"] = eval_metric
        fit_kwargs["X_val"] = X_val
        fit_kwargs["y_val"] = y_val

    if "holdout" == eval_method:
        val_loss, metric_for_logging, train_time, pred_time = get_val_loss(
            config_dic,
            estimator,
            X_train,
            y_train,
            X_val,
            y_val,
            weight_val,
            groups_val,
            eval_metric,
            task,
            labels=fit_kwargs.get("label_list"),  # pass the label list on to compute the evaluation metric
            budget=budget,
            log_training_metric=log_training_metric,
            fit_kwargs=fit_kwargs,
            # NOTE(review): the `free_mem_ratio` parameter is not forwarded
            # here (a literal 0 is passed) — confirm whether this is intended.
            free_mem_ratio=0,
        )
    else:
        val_loss, metric_for_logging, train_time, pred_time = task.evaluate_model_CV(
            config_dic,
            estimator,
            X_train,
            y_train,
            budget,
            kf,
            eval_metric,
            best_val_loss,
            cv_score_agg_func,
            log_training_metric=log_training_metric,
            fit_kwargs=fit_kwargs,
            # NOTE(review): same as above — literal 0 instead of free_mem_ratio.
            free_mem_ratio=0,
        )

    if isinstance(estimator, TransformersEstimator):
        # Undo the temporary mutation of the caller's fit_kwargs dict.
        del fit_kwargs["metric"], fit_kwargs["X_val"], fit_kwargs["y_val"]

    return estimator, val_loss, metric_for_logging, train_time, pred_time
def train_estimator(
    config_dic: dict,
    X_train,
    y_train,
    task: str,
    estimator_name: str,
    n_jobs: Optional[int] = 1,  # some estimators of EstimatorSubclass don't accept n_jobs. Should be None in that case.
    estimator_class: Optional[EstimatorSubclass] = None,
    budget=None,
    fit_kwargs: Optional[dict] = None,
    eval_metric=None,
    free_mem_ratio=0,
) -> Tuple[EstimatorSubclass, float]:
    """Build an estimator from `config_dic` and fit it on the training data.

    Returns:
        A tuple (estimator, train_time). When `X_train` is None, no fitting
        happens: the underlying library estimator is instantiated with the
        configured params and the elapsed wall-clock time is returned.
    """
    start_time = time.time()
    resolved_class = estimator_class or task.estimator_class_from_str(estimator_name)
    estimator = resolved_class(
        **config_dic,
        task=task,
        n_jobs=n_jobs,
    )
    if fit_kwargs is None:
        fit_kwargs = {}
    if isinstance(estimator, TransformersEstimator):
        # Transformers estimators receive the metric through fit kwargs.
        fit_kwargs["metric"] = eval_metric
    if X_train is None:
        # No data: hand back the raw library estimator configured with params.
        estimator = estimator.estimator_class(**estimator.params)
        train_time = time.time() - start_time
    else:
        train_time = estimator.fit(X_train, y_train, budget=budget, free_mem_ratio=free_mem_ratio, **fit_kwargs)
    return estimator, train_time
def norm_confusion_matrix(y_true: Union[np.array, Series], y_pred: Union[np.array, Series]):
    """Row-normalized confusion matrix.

    Args:
        y_true: A numpy array or a pandas series of true labels.
        y_pred: A numpy array or a pandas series of predicted labels.

    Returns:
        A confusion matrix whose rows each sum to 1 (per-true-class rates).
    """
    from sklearn.metrics import confusion_matrix

    raw_counts = confusion_matrix(y_true, y_pred)
    row_totals = raw_counts.sum(axis=1)[:, np.newaxis]
    return raw_counts.astype("float") / row_totals
def multi_class_curves(
    y_true: Union[np.array, Series],
    y_pred_proba: Union[np.array, Series],
    curve_func: Callable,
):
    """Binarize the data for multi-class tasks and produce ROC or precision-recall curves.

    Args:
        y_true: A numpy array or a pandas series of true labels.
        y_pred_proba: A numpy array or a pandas dataframe of predicted probabilites.
        curve_func: A function to produce a curve (e.g., roc_curve or precision_recall_curve).

    Returns:
        A tuple of two dictionaries with the same set of keys (class indices).
        The first dictionary curve_x stores the x coordinates of each curve, e.g.,
        curve_x[0] is an 1D array of the x coordinates of class 0.
        The second dictionary curve_y stores the y coordinates of each curve, e.g.,
        curve_y[0] is an 1D array of the y coordinates of class 0.
    """
    from sklearn.preprocessing import label_binarize

    classes = np.unique(y_true)
    # One-vs-rest binarization: column i is the indicator for classes[i].
    y_true_binary = label_binarize(y_true, classes=classes)
    curve_x, curve_y = {}, {}
    for class_idx in range(len(classes)):
        xs, ys, _ = curve_func(y_true_binary[:, class_idx], y_pred_proba[:, class_idx])
        curve_x[class_idx], curve_y[class_idx] = xs, ys
    return curve_x, curve_y
def get_val_loss(
    config,
    estimator,
    X_train,
    y_train,
    X_val,
    y_val,
    weight_val,
    groups_val,
    eval_metric,
    task,
    labels=None,
    budget=None,
    log_training_metric=False,
    fit_kwargs=None,
    free_mem_ratio=0,
):
    """Fit `estimator` on the training split and score it on the validation split.

    Args:
        config: The hyperparameter config being evaluated (forwarded to custom metrics).
        estimator: An unfitted estimator implementing `.fit(...)`.
        eval_metric: Metric name or custom callable, handled by `_eval_estimator`.
        labels: Optional unique label list for metric computation.
        budget: Optional time budget forwarded to `estimator.fit`.
        log_training_metric: Whether to also compute the training loss.
        fit_kwargs: Extra keyword args for `estimator.fit`; defaults to {}.
        free_mem_ratio: Forwarded to `estimator.fit`.

    Returns:
        A tuple (val_loss, metric_for_logging, train_time, pred_time).
    """
    # Idiom fix: the original used a mutable default argument (`fit_kwargs={}`);
    # use None and create a fresh dict per call instead.
    if fit_kwargs is None:
        fit_kwargs = {}
    start = time.time()
    # if groups_val is not None:
    #     fit_kwargs['groups_val'] = groups_val
    #     fit_kwargs['X_val'] = X_val
    #     fit_kwargs['y_val'] = y_val
    estimator.fit(X_train, y_train, budget=budget, free_mem_ratio=free_mem_ratio, **fit_kwargs)
    val_loss, metric_for_logging, pred_time, _ = _eval_estimator(
        config,
        estimator,
        X_train,
        y_train,
        X_val,
        y_val,
        weight_val,
        groups_val,
        eval_metric,
        task,
        labels,
        log_training_metric,
        fit_kwargs,
    )
    if hasattr(estimator, "intermediate_results"):
        # Surface per-epoch logs collected by TrainerForAuto-style estimators.
        metric_for_logging["intermediate_results"] = estimator.intermediate_results
    train_time = time.time() - start
    return val_loss, metric_for_logging, train_time, pred_time
def default_cv_score_agg_func(val_loss_folds, log_metrics_folds):
    """Aggregate per-fold CV results by averaging.

    Args:
        val_loss_folds: A list of per-fold validation losses.
        log_metrics_folds: A list of per-fold logged metrics; each entry is
            either a dict of named metrics or a single number.

    Returns:
        A tuple (mean validation loss, averaged logged metrics) where the
        second element mirrors the per-fold structure (dict or number), or is
        None/falsy when nothing was logged.
    """
    n_folds = len(val_loss_folds)
    metric_to_minimize = sum(val_loss_folds) / n_folds
    aggregated = None
    for fold_metrics in log_metrics_folds:
        if aggregated is None:
            aggregated = fold_metrics
        elif isinstance(aggregated, dict):
            # Element-wise accumulation over the fold's keys.
            aggregated = {key: aggregated[key] + val for key, val in fold_metrics.items()}
        else:
            aggregated = aggregated + fold_metrics
    if aggregated:
        aggregated = (
            {key: val / n_folds for key, val in aggregated.items()}
            if isinstance(aggregated, dict)
            else aggregated / n_folds
        )
    return metric_to_minimize, aggregated
def _eval_estimator(
    config,
    estimator,
    X_train,
    y_train,
    X_val,
    y_val,
    weight_val,
    groups_val,
    eval_metric,
    task,
    labels=None,
    log_training_metric=False,
    # NOTE(review): mutable default argument; harmless here since the dict is
    # only read via .get(), but consider `fit_kwargs=None`.
    fit_kwargs={},
):
    """Score a fitted estimator on the validation split.

    When `eval_metric` is a string it is resolved through metric_loss_score;
    when it is a callable, the callable computes the loss and logging metrics
    itself.

    Returns:
        A tuple (val_loss, metric_for_logging, pred_time, val_pred_y), where
        pred_time is the per-row prediction time and val_pred_y may be None
        for custom metric functions.
    """
    if isinstance(eval_metric, str):
        pred_start = time.time()
        val_pred_y = get_y_pred(estimator, X_val, eval_metric, task)

        # TODO: why are integer labels being cast to str in the first place?
        if isinstance(val_pred_y, Series) or isinstance(val_pred_y, DataFrame) or isinstance(val_pred_y, np.ndarray):
            test = val_pred_y if isinstance(val_pred_y, np.ndarray) else val_pred_y.values
            if not np.issubdtype(test.dtype, np.number):
                # some NLP models return a list
                val_pred_y = val_pred_y.astype(str)

        if isinstance(X_val, TimeSeriesDataset):
            # Time-series datasets carry their own train/test targets; align
            # their dtype with the predictions before scoring.
            num_val_rows = len(X_val.test_data)
            y_val = X_val.test_data[X_val.target_names].values.astype(val_pred_y.dtype)
            y_train = X_val.train_data[X_val.target_names].values.astype(val_pred_y.dtype)
        else:
            num_val_rows = X_val.shape[0]

        # Per-row prediction latency.
        pred_time = (time.time() - pred_start) / num_val_rows

        val_loss = metric_loss_score(
            eval_metric,
            y_processed_predict=val_pred_y,
            y_processed_true=y_val,
            labels=labels,
            sample_weight=weight_val,
            groups=groups_val,
        )
        metric_for_logging = {"pred_time": pred_time}
        if log_training_metric:
            train_pred_y = get_y_pred(estimator, X_train, eval_metric, task)
            metric_for_logging["train_loss"] = metric_loss_score(
                eval_metric,
                train_pred_y,
                y_train,
                labels,
                fit_kwargs.get("sample_weight"),
                fit_kwargs.get("groups"),
            )
    else:  # customized metric function
        val_loss, metric_for_logging = eval_metric(
            X_val,
            y_val,
            estimator,
            labels,
            X_train,
            y_train,
            weight_val,
            fit_kwargs.get("sample_weight"),
            config,
            groups_val,
            fit_kwargs.get("groups"),
        )
        pred_time = metric_for_logging.get("pred_time", 0)
        val_pred_y = None
        # eval_metric may return val_pred_y but not necessarily. Setting None for now.
    return val_loss, metric_for_logging, pred_time, val_pred_y

File diff suppressed because it is too large Load Diff

View File

@ -1,25 +0,0 @@
# AutoML for NLP
This directory contains utility functions used by AutoNLP. Currently we support four NLP tasks: sequence classification, sequence regression, multiple choice and summarization.
Please refer to this [link](https://microsoft.github.io/FLAML/docs/Examples/AutoML-NLP) for examples.
# Troubleshooting fine-tuning HPO for pre-trained language models
The frequent updates of transformers may lead to fluctuations in the results of tuning. To help users quickly troubleshoot the result of AutoNLP when a tuning failure occurs (e.g., failing to reproduce previous results), we have provided the following jupyter notebook:
* [Troubleshooting HPO for fine-tuning pre-trained language models](https://github.com/microsoft/FLAML/blob/main/notebook/research/acl2021.ipynb)
Our findings on troubleshooting fine-tuning the Electra and RoBERTa model for the GLUE dataset can be seen in the following paper published in ACL 2021:
* [An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models](https://arxiv.org/abs/2106.09204). Xueqing Liu, Chi Wang. ACL-IJCNLP 2021.
```bibtex
@inproceedings{liu2021hpo,
title={An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models},
author={Xueqing Liu and Chi Wang},
year={2021},
booktitle={ACL-IJCNLP},
}
```

View File

@ -1,50 +0,0 @@
from dataclasses import dataclass
from transformers.data.data_collator import (
DataCollatorWithPadding,
DataCollatorForTokenClassification,
DataCollatorForSeq2Seq,
)
from collections import OrderedDict
from flaml.automl.task.task import (
TOKENCLASSIFICATION,
MULTICHOICECLASSIFICATION,
SUMMARIZATION,
SEQCLASSIFICATION,
SEQREGRESSION,
)
@dataclass
class DataCollatorForMultipleChoiceClassification(DataCollatorWithPadding):
    # Collator for multiple-choice classification: flattens (sample, choice)
    # pairs so the parent padding collator can pad them as one batch, then
    # restores the (batch_size, num_choices, seq_len) layout and labels.
    def __call__(self, features):
        from itertools import chain

        import torch

        # Labels may arrive under "label" or "labels"; pop them before padding.
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features] if label_name in features[0] else None
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        # One tokenized example per (sample, choice) pair.
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = list(chain(*flattened_features))
        batch = super(DataCollatorForMultipleChoiceClassification, self).__call__(flattened_features)
        # Un-flatten
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        if labels:
            batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch
# Maps each NLP task constant to the collator class used to batch its examples.
# An OrderedDict keeps iteration order deterministic.
task_to_datacollator_class = OrderedDict(
    [
        (TOKENCLASSIFICATION, DataCollatorForTokenClassification),
        (MULTICHOICECLASSIFICATION, DataCollatorForMultipleChoiceClassification),
        (SUMMARIZATION, DataCollatorForSeq2Seq),
        (SEQCLASSIFICATION, DataCollatorWithPadding),
        (SEQREGRESSION, DataCollatorWithPadding),
    ]
)

View File

@ -1,90 +0,0 @@
import os
try:
from transformers import Seq2SeqTrainer
except ImportError:
Seq2SeqTrainer = object
class TrainerForAuto(Seq2SeqTrainer):
    """Trainer used by FLAML's HPO.

    Dispatches to Seq2SeqTrainer behavior when the `_is_seq2seq` flag is set
    (presumably set elsewhere for NLG tasks — confirm against callers),
    otherwise bypasses it via `super(Seq2SeqTrainer, self)` to get the plain
    transformers.Trainer code path. Also records per-epoch logs
    (`intermediate_results`) and per-checkpoint metrics (`ckpt_to_metric`).
    """

    def predict(
        self,
        test_dataset,
        ignore_keys=None,
        metric_key_prefix=None,
        max_length=None,
        num_beams=None,
    ):
        # Generation args (max_length/num_beams) only apply on the seq2seq path.
        if getattr(self, "_is_seq2seq", None):
            return super().predict(
                test_dataset,
                ignore_keys,
                metric_key_prefix=metric_key_prefix,
                max_length=max_length,
                num_beams=num_beams,
            )
        else:
            return super(Seq2SeqTrainer, self).predict(test_dataset, ignore_keys, metric_key_prefix)

    def prediction_step(
        self,
        model,
        inputs,
        prediction_loss_only,
        ignore_keys,
    ):
        if getattr(self, "_is_seq2seq", None):
            return super().prediction_step(model, inputs, prediction_loss_only, ignore_keys)
        else:
            return super(Seq2SeqTrainer, self).prediction_step(model, inputs, prediction_loss_only, ignore_keys)

    def log(self, logs) -> None:
        if getattr(self, "_is_seq2seq", None):
            super().log(logs)
        else:
            super(Seq2SeqTrainer, self).log(logs)
        if not hasattr(self, "intermediate_results"):
            self.intermediate_results = {}
        # Group log entries by epoch number for later inspection by the HPO.
        epoch_num = logs.get("epoch", None)
        if epoch_num:
            self.intermediate_results.setdefault(epoch_num, {})
            self.intermediate_results[epoch_num].update(logs)

    def evaluate(
        self,
        eval_dataset=None,
        ignore_keys=None,
        metric_key_prefix="eval",
    ):
        """Overriding transformers.Trainer.evaluate by saving metrics and checkpoint path."""
        from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

        # Checkpoint directory named after the current global step.
        ckpt_dir = os.path.join(self.args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}")
        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
        # TODO: if your task is seq2seq (i.e., SUMMARIZATION), uncomment the code below (add indentation before metrics = eval_dataset...
        if getattr(self, "_is_seq2seq", None):
            metrics = eval_dataset and super().evaluate(
                eval_dataset,
                ignore_keys,
                metric_key_prefix,
                max_length=self.args.generation_max_length,
                num_beams=self.args.generation_num_beams,
            )
        else:
            metrics = eval_dataset and super(Seq2SeqTrainer, self).evaluate(
                eval_dataset,
                ignore_keys,
                metric_key_prefix,
            )
        # Remember global step and metrics for each checkpoint directory.
        if hasattr(self, "ckpt_to_global_step"):
            self.ckpt_to_global_step[ckpt_dir] = self.state.global_step
            if metrics:
                self.ckpt_to_metric[ckpt_dir] = metrics
        else:
            self.ckpt_to_global_step = {ckpt_dir: self.state.global_step}
            self.ckpt_to_metric = {ckpt_dir: metrics} if metrics else {}
        return metrics

View File

@ -1,128 +0,0 @@
import argparse
from dataclasses import dataclass, field
from typing import Optional, List
from flaml.automl.task.task import NLG_TASKS
try:
from transformers import TrainingArguments
except ImportError:
TrainingArguments = object
@dataclass
class TrainingArgumentsForAuto(TrainingArguments):
    """FLAML custom TrainingArguments.

    Args:
        task (str): the task name for NLP tasks, e.g., seq-classification, token-classification
        output_dir (str): data root directory for outputing the log, etc.
        model_path (str, optional, defaults to "facebook/muppet-roberta-base"): A string,
            the path of the language model file, either a path from huggingface
            model card huggingface.co/models, or a local path for the model.
        fp16 (bool, optional, defaults to "False"): A bool, whether to use FP16.
        max_seq_length (int, optional, defaults to 128): An integer, the max length of the sequence.
            For token classification task, this argument will be ineffective.
        pad_to_max_length (bool, optional, defaults to "False"):
            whether to pad all samples to model maximum sentence length.
            If False, will pad the samples dynamically when batching to the maximum length in the batch.
        per_device_eval_batch_size (int, optional, defaults to 1): An integer, the per gpu evaluation batch size.
        label_list (List[str], optional, defaults to None): A list of string, the string list of the label names.
            When the task is sequence labeling/token classification, there are two formats of the labels:
            (1) The token labels, i.e., [B-PER, I-PER, B-LOC]; (2) Id labels. For (2), need to pass the label_list (e.g., [B-PER, I-PER, B-LOC])
            to convert the Id to token labels when computing the metric with metric_loss_score.
            See the example in [a simple token classification example](/docs/Examples/AutoML-NLP#a-simple-token-classification-example).
    """

    task: str = field(default="seq-classification")
    output_dir: str = field(default="data/output/", metadata={"help": "data dir"})
    model_path: str = field(
        default="facebook/muppet-roberta-base",
        metadata={
            "help": "model path for HPO natural language understanding tasks, default is set to facebook/muppet-roberta-base"
        },
    )
    fp16: bool = field(default=True, metadata={"help": "whether to use the FP16 mode"})
    max_seq_length: int = field(default=128, metadata={"help": "max seq length"})
    label_all_tokens: bool = field(
        default=False,
        metadata={
            "help": "For NER task, whether to set the extra tokenized labels to the same label (instead of -100)"
        },
    )
    pad_to_max_length: bool = field(
        default=False,
        metadata={
            "help": "Whether to pad all samples to model maximum sentence length. "
            "If False, will pad the samples dynamically when batching to the maximum length in the batch. "
        },
    )
    per_device_eval_batch_size: int = field(
        default=1,
        metadata={"help": "per gpu evaluation batch size"},
    )
    label_list: Optional[List[str]] = field(default=None, metadata={"help": "The string list of the label names. "})
    # Evaluation/checkpoint/log cadence, in optimizer steps.
    eval_steps: int = field(default=500, metadata={"help": "Run an evaluation every X steps."})
    save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."})
    logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})

    @staticmethod
    def load_args_from_console():
        # Build an argparse CLI from the dataclass fields and parse known args.
        from dataclasses import fields

        arg_parser = argparse.ArgumentParser()
        for each_field in fields(TrainingArgumentsForAuto):
            # NOTE(review): this prints every field on load — looks like
            # leftover debug output; confirm whether it should be removed.
            print(each_field)
            arg_parser.add_argument(
                "--" + each_field.name,
                type=each_field.type,
                help=each_field.metadata["help"],
                required=each_field.metadata["required"] if "required" in each_field.metadata else False,
                choices=each_field.metadata["choices"] if "choices" in each_field.metadata else None,
                default=each_field.default,
            )
        console_args, unknown = arg_parser.parse_known_args()
        return console_args
@dataclass
class Seq2SeqTrainingArgumentsForAuto(TrainingArgumentsForAuto):
    """Training arguments for natural language generation (seq2seq) tasks.

    Extends TrainingArgumentsForAuto with generation-specific settings and a
    t5-small default model.
    """

    model_path: str = field(
        default="t5-small",
        metadata={"help": "model path for HPO natural language generation tasks, default is set to t5-small"},
    )
    sortish_sampler: bool = field(default=False, metadata={"help": "Whether to use SortishSampler or not."})
    predict_with_generate: bool = field(
        default=True,
        metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."},
    )
    generation_max_length: Optional[int] = field(
        default=None,
        metadata={
            "help": "The `max_length` to use on each evaluation loop when `predict_with_generate=True`. Will default "
            "to the `max_length` value of the model configuration."
        },
    )
    generation_num_beams: Optional[int] = field(
        default=None,
        metadata={
            "help": "The `num_beams` to use on each evaluation loop when `predict_with_generate=True`. Will default "
            "to the `num_beams` value of the model configuration."
        },
    )

    def __post_init__(self):
        super().__post_init__()
        # For NLG tasks, force the t5-small default regardless of the inherited value.
        if self.task in NLG_TASKS:
            self.model_path = "t5-small"

View File

@ -1,422 +0,0 @@
from itertools import chain
import numpy as np
from flaml.automl.task.task import (
SUMMARIZATION,
SEQREGRESSION,
SEQCLASSIFICATION,
MULTICHOICECLASSIFICATION,
TOKENCLASSIFICATION,
NLG_TASKS,
)
from flaml.automl.data import pd
def todf(X, Y, column_name):
    """Convert Y (list, pandas.Series, or numpy array) to a DataFrame.

    The result is indexed like `X` and its columns are renamed to
    `column_name`. A None `Y` is returned unchanged.
    """
    if Y is None:
        return None
    frame = pd.DataFrame(Y, index=X.index)
    frame.columns = column_name
    return frame
def tokenize_text(X, Y=None, task=None, hf_args=None, tokenizer=None):
    """Tokenize (X, Y) according to the NLP task and return (X_tokenized, Y_tokenized).

    Dispatches per task: plain tokenization for sequence classification and
    regression, label-aligned tokenization for token classification, seq2seq
    tokenization for NLG tasks, and multiple-choice tokenization otherwise.
    """
    label_col_name = None
    # label_col_name is the name of the label column Y, label_col_name = ['labels'] for TOKENCLASSIFICATION and SUMMARIZATION,
    # label_col_name = ['label'] for other tasks. todf is used by all tasks except for SUMMARIZATION,
    # because the outputs of tokenize_seq2seq are already two DataFrames so no conversion needed.
    if task in (SEQCLASSIFICATION, SEQREGRESSION):
        X_tokenized = tokenize_onedataframe(
            X,
            tokenizer=tokenizer,
            task=task,
            hf_args=hf_args,
            prefix_str="",
        )
        Y_tokenized = Y
        label_col_name = ["label"]
    elif task == TOKENCLASSIFICATION:
        X_tokenized, Y_tokenized = tokenize_text_tokclassification(X, Y, tokenizer=tokenizer, hf_args=hf_args)
        label_col_name = ["labels"]
    elif task in NLG_TASKS:
        # Seq2seq returns two ready-made DataFrames; no todf conversion needed.
        return tokenize_seq2seq(X, Y, tokenizer=tokenizer, task=task, hf_args=hf_args)
    elif task == MULTICHOICECLASSIFICATION:
        X_tokenized = tokenize_text_multiplechoice(X, tokenizer=tokenizer, hf_args=hf_args)
        label_col_name = ["label"]
        Y_tokenized = Y
    # Normalize Y into a DataFrame aligned with X_tokenized's index.
    Y_tokenized = todf(X_tokenized, Y_tokenized, label_col_name)
    return X_tokenized, Y_tokenized
def tokenize_seq2seq(X, Y, tokenizer, task=None, hf_args=None):
    """Tokenize inputs and (optionally) targets for seq2seq (summarization) tasks.

    Inputs are prefixed with "summarize: " per the T5 convention used here.
    Returns (model_inputs, model_outputs); model_outputs is None when Y is None.
    """
    model_inputs = tokenize_onedataframe(
        X,
        tokenizer=tokenizer,
        task=task,
        hf_args=hf_args,
        prefix_str="summarize: ",
    )
    model_outputs = None
    if Y is not None:
        model_outputs = tokenize_onedataframe(
            Y.to_frame(),
            tokenizer=tokenizer,
            task=task,
            hf_args=hf_args,
            prefix_str="",
        )
        # Replace pad token ids with -100 so the loss function ignores padding.
        model_outputs["labels"] = [
            [(each_l if each_l != tokenizer.pad_token_id else -100) for each_l in label]
            for label in model_outputs["input_ids"]
        ]
        # Only the "labels" column is needed on the target side.
        model_outputs = model_outputs.drop(columns=["attention_mask", "input_ids", "decoder_input_ids"])
    return model_inputs, model_outputs
def tokenize_and_align_labels(
    examples,
    tokenizer,
    label_to_id,
    b_to_i_label,
    hf_args=None,
    X_sent_key=None,
    Y_sent_key=None,
    return_column_name=False,
):
    """Tokenize one pre-split sentence and align its per-word labels to subtokens.

    Special tokens and (optionally) non-first subtokens get label -100 so they
    are ignored by the loss. Returns the tokenized fields as a list (ordered by
    sorted field name), optionally together with the field names.
    """
    # tokenize_and_align_labels is only called by the token-classification task
    tokenized_inputs = tokenizer(
        [list(examples[X_sent_key])],
        padding="max_length"
        if hf_args and hf_args.pad_to_max_length
        else False,  # to be consistent with https://github.com/huggingface/transformers/blob/main/examples/pytorch/token-classification/run_ner.py#L394
        truncation=True,
        max_length=hf_args.max_seq_length if hf_args else None,
        # We use this argument because the texts in our dataset are lists of words (with a label for each word).
        is_split_into_words=True,
    )
    if Y_sent_key is not None:
        previous_word_idx = None
        label_ids = []
        for word_idx in tokenized_inputs.word_ids(batch_index=0):
            if word_idx is None:
                # Special tokens (CLS/SEP/PAD) have no source word: ignore in loss.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First subtoken of a word carries the word's label.
                label_ids.append(label_to_id[examples[Y_sent_key][word_idx]])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                # Use the label_all_tokens to control whether to copy the label to all subtokens or to pad the additional tokens as -100
                if hf_args.label_all_tokens:
                    # If the B- word is converted into multiple subtokens, map the additional subtokens to I-
                    label_ids.append(b_to_i_label[label_to_id[examples[Y_sent_key][word_idx]]])
                else:
                    label_ids.append(-100)
            previous_word_idx = word_idx
        tokenized_inputs["labels"] = label_ids
    # Flatten the single-sentence batch: every field except "labels" is nested
    # one level deep and is unwrapped to its first (only) element.
    tmp_column_names = sorted(tokenized_inputs.keys())
    tokenized_input_and_labels = [tokenized_inputs[x] for x in tmp_column_names]
    for key_idx, each_key in enumerate(tmp_column_names):
        if each_key != "labels":
            tokenized_input_and_labels[key_idx] = tokenized_input_and_labels[key_idx][0]
    if return_column_name:
        return tokenized_input_and_labels, tmp_column_names
    else:
        return tokenized_input_and_labels
def tokenize_text_tokclassification(X, Y, tokenizer, hf_args=None):
    """Tokenize an entire token-classification dataset, aligning labels when provided.

    Args:
        X: DataFrame with a single column of word lists (one sentence per row).
        Y: Series of per-word label lists aligned with X, or None at prediction time.
        tokenizer: HuggingFace tokenizer.
        hf_args: Fine-tuning args; hf_args.label_list is required here.

    Returns:
        (X_tokenized, y_tokenized): tokenized features DataFrame and the aligned
        labels Series (None when Y is None).
    """
    # If the label_all_tokens flag is True, prepare two dicts label_to_id and b_to_i_label to convert the B- labels to I- labels
    label_to_id = {i: i for i in range(len(hf_args.label_list))}
    b_to_i_label = []
    for idx, label in enumerate(hf_args.label_list):
        if label.startswith("B-") and label.replace("B-", "I-") in hf_args.label_list:
            b_to_i_label.append(hf_args.label_list.index(label.replace("B-", "I-")))
        else:
            # No matching I- label: keep the original id.
            b_to_i_label.append(idx)
    if Y is not None:
        X_and_Y = pd.concat([X, Y.to_frame()], axis=1)
        X_key = list(X.keys())[0]
        Y_key = list(Y.to_frame().keys())[0]
        # tokenize_and_align_labels is only called by the token-classification task
        # Probe one row first to discover the tokenized column names.
        _, tokenized_column_names = tokenize_and_align_labels(
            X_and_Y.iloc[0],
            tokenizer=tokenizer,
            hf_args=hf_args,
            X_sent_key=X_key,
            Y_sent_key=Y_key,
            return_column_name=True,
            label_to_id=label_to_id,
            b_to_i_label=b_to_i_label,
        )
        X_and_Y_tokenized = X_and_Y.apply(
            lambda x: tokenize_and_align_labels(
                x,
                tokenizer=tokenizer,
                hf_args=hf_args,
                X_sent_key=X_key,
                Y_sent_key=Y_key,
                label_to_id=label_to_id,
                b_to_i_label=b_to_i_label,
            ),
            axis=1,
            result_type="expand",
        )
        # Split the expanded columns into features and the "labels" column.
        label_idx = tokenized_column_names.index("labels")
        other_indices = sorted(set(range(len(tokenized_column_names))).difference({label_idx}))
        other_column_names = [tokenized_column_names[x] for x in other_indices]
        d = X_and_Y_tokenized.iloc[:, other_indices]
        y_tokenized = X_and_Y_tokenized.iloc[:, label_idx]
    else:
        X_key = list(X.keys())[0]
        # No labels: probe one row for column names, then tokenize features only.
        _, tokenized_column_names = tokenize_and_align_labels(
            X.iloc[0],
            tokenizer=tokenizer,
            hf_args=hf_args,
            X_sent_key=X_key,
            Y_sent_key=None,
            return_column_name=True,
            label_to_id=label_to_id,
            b_to_i_label=b_to_i_label,
        )
        d = X.apply(
            lambda x: tokenize_and_align_labels(
                x,
                tokenizer=tokenizer,
                hf_args=hf_args,
                X_sent_key=X_key,
                Y_sent_key=None,
                label_to_id=label_to_id,
                b_to_i_label=b_to_i_label,
            ),
            axis=1,
            result_type="expand",
        )
        other_column_names = tokenized_column_names
        y_tokenized = None
    X_tokenized = pd.DataFrame(columns=other_column_names)
    X_tokenized[other_column_names] = d
    return X_tokenized, y_tokenized
def tokenize_onedataframe(
    X,
    tokenizer,
    task=None,
    hf_args=None,
    prefix_str=None,
):
    """Tokenize every row of a single-column text DataFrame.

    For summarization, each row is prefixed with `prefix_str` (e.g. "summarize: ")
    before tokenization.

    Args:
        X: DataFrame whose rows hold the text to tokenize.
        tokenizer: HuggingFace tokenizer.
        task: The NLP task; only compared against SUMMARIZATION here.
        hf_args: Optional fine-tuning args (padding/max length settings).
        prefix_str: Prefix prepended to each row for summarization.

    Returns:
        DataFrame of tokenizer outputs, one column per tokenizer output key.
    """
    # as_target_tokenizer: tokenize with the target-side (label) tokenizer settings,
    # required when this frame holds NLG labels.
    with tokenizer.as_target_tokenizer():
        # Probe one row first to discover tokenizer output column names.
        _, tokenized_column_names = tokenize_row(
            dict(X.iloc[0]),
            tokenizer,
            prefix=(prefix_str,) if task is SUMMARIZATION else None,
            task=task,
            hf_args=hf_args,
            return_column_name=True,
        )
        d = X.apply(
            lambda x: tokenize_row(
                x,
                tokenizer,
                prefix=(prefix_str,) if task is SUMMARIZATION else None,
                task=task,
                hf_args=hf_args,
            ),
            axis=1,
            result_type="expand",
        )
        X_tokenized = pd.DataFrame(columns=tokenized_column_names)
        X_tokenized[tokenized_column_names] = d
        return X_tokenized
def tokenize_row(
    this_row,
    tokenizer,
    prefix=None,
    task=None,
    hf_args=None,
    return_column_name=False,
):
    """Tokenize a single row of text columns.

    Args:
        this_row: A row of one or more text fields (tuple/Series/dict values).
        tokenizer: HuggingFace tokenizer; called with the row's fields as positional args.
        prefix: Optional tuple of prefix strings joined onto the row's fields
            (zip truncates to the shorter of the two).
        task: The NLP task; NLG tasks additionally get decoder_input_ids.
        hf_args: Optional fine-tuning args (padding/max length settings).
        return_column_name: When True, also return the sorted output column names.

    Returns:
        Tokenizer output values ordered by sorted key name; optionally followed
        by the key-name list.
    """
    if prefix:
        this_row = tuple("".join(pair) for pair in zip(prefix, this_row))
    pad_to_max = bool(hf_args and hf_args.pad_to_max_length)
    encoded = tokenizer(
        *tuple(this_row),
        padding="max_length" if pad_to_max else False,
        max_length=hf_args.max_seq_length if hf_args else None,
        truncation=True,
    )
    if task in NLG_TASKS:
        # Seq2seq models need decoder inputs; mirror the encoder input ids.
        encoded["decoder_input_ids"] = encoded["input_ids"]
    column_names = sorted(encoded.keys())
    values = [encoded[name] for name in column_names]
    return (values, column_names) if return_column_name else values
def tokenize_text_multiplechoice(X, tokenizer, hf_args=None):
    """Tokenize a SWAG-style multiple-choice dataset.

    Args:
        X: DataFrame containing the columns sent1, sent2, ending0..ending3.
        tokenizer: HuggingFace tokenizer.
        hf_args: Optional fine-tuning args (padding/max length settings).

    Returns:
        DataFrame of tokenizer output columns joined with the original columns of X.
    """
    swag_columns = ["sent1", "sent2", "ending0", "ending1", "ending2", "ending3"]
    t = X[swag_columns]
    # Probe one row to discover tokenizer output column names.
    _, tokenized_column_names = tokenize_swag(
        t.iloc[0],
        tokenizer=tokenizer,
        hf_args=hf_args,
        return_column_name=True,
    )
    expanded = t.apply(
        lambda row: tokenize_swag(row, tokenizer=tokenizer, hf_args=hf_args),
        axis=1,
        result_type="expand",
    )
    X_tokenized = pd.DataFrame(columns=tokenized_column_names)
    X_tokenized[tokenized_column_names] = expanded
    return X_tokenized.join(X)
def tokenize_swag(this_row, tokenizer, hf_args=None, return_column_name=False):
    """Tokenize one SWAG-style row into four (context, candidate) sentence pairs.

    Args:
        this_row: Mapping with keys sent1 (context), sent2 (noun phrase) and
            ending0..ending3 (candidate endings).
        tokenizer: HuggingFace tokenizer; receives the two sentence lists positionally.
        hf_args: Optional fine-tuning args (padding/max length settings).
        return_column_name: When True, also return the sorted output column names.

    Returns:
        Tokenizer output values ordered by sorted key name; optionally followed
        by the key-name list.
    """
    # Repeat the context sentence once per candidate ending.
    first_sentences = [this_row["sent1"]] * 4
    # Each candidate joins the noun phrase with one of the four endings.
    stem = this_row["sent2"]
    second_sentences = [stem + " " + this_row[key] for key in ("ending0", "ending1", "ending2", "ending3")]
    encoded = tokenizer(
        *tuple([first_sentences, second_sentences]),
        truncation=True,
        max_length=hf_args.max_seq_length if hf_args else None,
        padding="max_length" if hf_args and hf_args.pad_to_max_length else False,
    )
    column_names = sorted(encoded.keys())
    values = [encoded[name] for name in column_names]
    return (values, column_names) if return_column_name else values
def postprocess_prediction_and_true(task, y_pred, tokenizer, hf_args, y_true=None, X=None):
    """Convert raw model outputs (and ground truth) into user-readable predictions per task.

    E.g. argmax over logits for classification, squeeze for regression, label-string
    alignment for token classification, and text decoding for summarization.

    Args:
        task: One of the FLAML NLP task constants.
        y_pred: Raw model output matrix; None yields zero predictions.
        tokenizer: HuggingFace tokenizer (used by token-classification and summarization).
        hf_args: Fine-tuning args; provides label_list for token classification.
        y_true: Optional ground truth, post-processed alongside the predictions.
        X: Optional raw inputs, used to recover padding positions when y_true is None.

    Returns:
        (postprocessed_predictions, postprocessed_y_true); y_true passthrough/None
        when not applicable.
        NOTE(review): an unrecognized task falls through and implicitly returns None —
        confirm callers only pass the tasks handled below.
    """
    # postprocess the matrix prediction y_pred and ground truth y_true into user readable format, e.g., for summarization, decode into text
    if y_pred is None:
        return np.array([0.0] * len(X)), y_true
    if task == SEQCLASSIFICATION:
        return np.argmax(y_pred, axis=1), y_true
    elif task == SEQREGRESSION:
        return np.squeeze(y_pred), y_true  # predictions.reshape((len(predictions),))
    elif task == TOKENCLASSIFICATION:
        assert (y_true is not None) or (X is not None), "One of y_true and X must not be None"
        ## If y_true is not None, we use y_true to remove the -100 in the prediction (postprocessing), and return the postprocessed y_true and prediction
        # If y_true is None, we use X to compute y_is_pad (i.e., whether y_true is -100 in that position), and use y_is_pad to remove the -100 in the prediction, and return the postprocessed prediction (not the y_true)
        y_predict = pd.Series(np.argmax(y_pred, axis=2).tolist())
        if y_true is None:
            # Re-tokenize X against the predicted labels to find which positions are padding.
            _, y_is_pad_df = tokenize_text(
                X,
                y_predict,
                task=task,
                hf_args=hf_args,
                tokenizer=tokenizer,
            )
            y_is_pad = y_is_pad_df.iloc[:, 0]
        else:
            y_is_pad = y_true
        label_len = len(hf_args.label_list)
        # Keep only (prediction, truth) pairs at non-padding positions (truth != -100).
        zip_pred_ispad = [
            [(p, ispd) for (p, ispd) in zip(each_pred, each_is_pad) if ispd != -100]
            for (each_pred, each_is_pad) in zip(y_predict, y_is_pad)
        ]
        # Out-of-range predicted ids are mapped to -1.
        y_pred_label = [
            [hf_args.label_list[p] if 0 <= p < label_len else -1 for (p, ispd) in each_list]
            for each_list in zip_pred_ispad
        ]  # To compute precision and recall, y_pred and y_true must be converted to string labels
        # (B-PER, I-PER, etc.), so that the category-based precision/recall (i.e., PER, LOC, etc.) scores can be computed
        if y_true is not None:
            y_true_label = [[tr for (p, tr) in each_list] for each_list in zip_pred_ispad]
        else:
            y_true_label = None
        return y_pred_label, y_true_label
    elif task == SUMMARIZATION:
        if isinstance(y_pred, tuple):
            # Some models return (logits, ...); take the argmax over the vocab axis.
            y_pred = np.argmax(y_pred[0], axis=2)
        decoded_preds = tokenizer.batch_decode(y_pred, skip_special_tokens=True)
        import nltk

        nltk.download("punkt")
        # rougeLSum expects newline-separated sentences.
        decoded_preds = [pred.strip() for pred in decoded_preds]
        decoded_preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in decoded_preds]
        if y_true is not None:
            # Replace the -100 loss-mask positions with the pad token so decoding works.
            y_true_labels = np.where(y_true != -100, y_true, tokenizer.pad_token_id)
            decoded_y_true_labels = tokenizer.batch_decode(y_true_labels, skip_special_tokens=True)
            decoded_y_true_labels = [label.strip() for label in decoded_y_true_labels]
            decoded_y_true_labels = ["\n".join(nltk.sent_tokenize(label)) for label in decoded_y_true_labels]
        else:
            decoded_y_true_labels = None
        return decoded_preds, decoded_y_true_labels
    elif task == MULTICHOICECLASSIFICATION:
        return np.argmax(y_pred, axis=1), y_true
def load_model(checkpoint_path, task, num_labels=None):
    """Load a HuggingFace model for the given task from a checkpoint.

    Args:
        checkpoint_path: Model name or local checkpoint directory.
        task: One of the FLAML NLP task constants.
        num_labels: Number of output labels for (token/sequence) classification
            and regression configs.

    Returns:
        The instantiated transformers model with token embeddings resized to the
        checkpoint's vocabulary size.
    """
    import transformers

    # Silence transformers' informational logging during model loading.
    transformers.logging.set_verbosity_error()
    from transformers import AutoConfig
    from flaml.automl.task.task import (
        SEQCLASSIFICATION,
        SEQREGRESSION,
        TOKENCLASSIFICATION,
    )

    def get_this_model(checkpoint_path, task, model_config):
        # Pick the AutoModel class that matches the task type.
        from transformers import AutoModelForSequenceClassification
        from transformers import AutoModelForSeq2SeqLM
        from transformers import AutoModelForMultipleChoice
        from transformers import AutoModelForTokenClassification

        if task in (SEQCLASSIFICATION, SEQREGRESSION):
            # ignore_mismatched_sizes: allow a classification head whose size differs
            # from the checkpoint's (e.g. different num_labels).
            return AutoModelForSequenceClassification.from_pretrained(
                checkpoint_path, config=model_config, ignore_mismatched_sizes=True
            )
        elif task == TOKENCLASSIFICATION:
            return AutoModelForTokenClassification.from_pretrained(checkpoint_path, config=model_config)
        elif task in NLG_TASKS:
            return AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path, config=model_config)
        elif task == MULTICHOICECLASSIFICATION:
            return AutoModelForMultipleChoice.from_pretrained(checkpoint_path, config=model_config)
        # NOTE(review): unknown task falls through and returns None — confirm unreachable.

    def _set_model_config(checkpoint_path):
        # Classification-style tasks need num_labels in the config; others use defaults.
        if task in (SEQCLASSIFICATION, SEQREGRESSION, TOKENCLASSIFICATION):
            model_config = AutoConfig.from_pretrained(
                checkpoint_path,
                num_labels=model_config_num_labels,
            )
            return model_config
        else:
            model_config = AutoConfig.from_pretrained(checkpoint_path)
            return model_config

    current_config = AutoConfig.from_pretrained(checkpoint_path)
    this_vocab_size = current_config.vocab_size
    model_config_num_labels = num_labels
    new_config = _set_model_config(checkpoint_path)
    this_model = get_this_model(checkpoint_path, task, new_config)
    # Keep the embedding matrix consistent with the checkpoint's vocabulary size.
    this_model.resize_token_embeddings(this_vocab_size)
    return this_model

View File

@ -1,108 +0,0 @@
from typing import Dict, Any
import numpy as np
from flaml.automl.task.task import (
SUMMARIZATION,
SEQREGRESSION,
SEQCLASSIFICATION,
MULTICHOICECLASSIFICATION,
TOKENCLASSIFICATION,
)
def load_default_huggingface_metric_for_task(task):
    """Return the default HuggingFace metric name for a FLAML NLP task.

    Returns None when the task has no registered default metric.
    """
    default_metric_by_task = {
        SEQCLASSIFICATION: "accuracy",
        SEQREGRESSION: "r2",
        SUMMARIZATION: "rouge1",
        MULTICHOICECLASSIFICATION: "accuracy",
        TOKENCLASSIFICATION: "seqeval",
    }
    return default_metric_by_task.get(task)
def is_a_list_of_str(this_obj):
    """Return True iff this_obj is a list or numpy array whose elements are all strings."""
    if not isinstance(this_obj, (list, np.ndarray)):
        return False
    return all(isinstance(item, str) for item in this_obj)
def _clean_value(value: Any) -> str:
if isinstance(value, float):
return "{:.5}".format(value)
else:
return str(value).replace("/", "_")
def format_vars(resolved_vars: Dict) -> str:
    """Format the resolved variable dict into a single comma-separated tag string.

    Each (path, value) entry becomes "key=value"; keys coming from the reserved
    "run"/"env"/"resources_per_trial" namespaces are skipped because the trial
    runner already encodes them in the experiment tag.
    """
    out = []
    for path, value in sorted(resolved_vars.items()):
        if path[0] in ["run", "env", "resources_per_trial"]:
            continue  # TrialRunner already has these in the experiment_tag
        pieces = []
        last_string = True
        # Walk the path from innermost to outermost component.
        for k in path[::-1]:
            if isinstance(k, int):
                pieces.append(str(k))
            elif last_string:
                last_string = False
                pieces.append(k)
        # NOTE(review): only the innermost string component is kept (outer string
        # components are dropped once last_string is False) — confirm intended.
        pieces.reverse()
        out.append(_clean_value("_".join(pieces)) + "=" + _clean_value(value))
    return ",".join(out)
# Module-level counter (appears unused in this file; the Counter class below keeps its own count).
counter = 0
def date_str():
    """Return the current local time formatted as 'YYYY-MM-DD_HH-MM-SS'."""
    from datetime import datetime

    now = datetime.today()
    return now.strftime("%Y-%m-%d_%H-%M-%S")
def _generate_dirname(experiment_tag, trial_id):
    """Build a path-safe trial directory name: 'train_<trial_id>_<tag>' (capped at
    130 chars) followed by a timestamp, with '/' replaced by '_'."""
    stem = f"train_{str(trial_id)}_{experiment_tag}"[:130]
    stem += f"_{date_str()}"
    return stem.replace("/", "_")
def get_logdir_name(dirname, local_dir):
    """Join local_dir (with '~' expanded) and dirname into a log-directory path."""
    import os

    base_dir = os.path.expanduser(local_dir)
    return os.path.join(base_dir, dirname)
class Counter:
    """Process-wide trial counter used to build unique, ordered trial directory names."""

    # Shared across all callers; incremented once per generated trial folder name.
    counter = 0

    @staticmethod
    def get_trial_fold_name(local_dir, trial_config, trial_id):
        """Return a unique log-directory path for a trial, tagged with the trial's
        sequence number and formatted hyperparameters."""
        Counter.counter += 1
        experiment_tag = "{0}_{1}".format(str(Counter.counter), format_vars(trial_config))
        logdir = get_logdir_name(_generate_dirname(experiment_tag, trial_id=trial_id), local_dir)
        return logdir
class LabelEncoderforTokenClassification:
    """Encode per-sentence token labels (lists of strings) into integer label ids.

    If the labels are already integer ids, fit_transform leaves them unchanged.
    """

    def fit_transform(self, y):
        """Learn the label vocabulary from y (a Series of label lists) and encode it.

        String labels are mapped to ids by sorted order; integer labels pass through.
        """
        first_sentence = y[0]
        # String labels: build the vocabulary and the token->id mapping, then encode.
        if any(isinstance(label, str) for label in first_sentence):
            self.label_list = sorted(set().union(*y))
            self._tokenlabel_to_id = {label: idx for idx, label in enumerate(self.label_list)}
            return y.apply(lambda sent: [self._tokenlabel_to_id[token] for token in sent])
        # Otherwise the labels must already be integer ids (checked on the first row).
        assert all(
            isinstance(label, (int, np.integer)) for label in first_sentence
        ), "The labels must either be tokens or ids"
        return y

    def transform(self, y):
        """Encode y with the mapping learned by fit_transform (identity if none was learned)."""
        if hasattr(self, "_tokenlabel_to_id"):
            return y.apply(lambda sent: [self._tokenlabel_to_id[token] for token in sent])
        return y

View File

@ -1,32 +0,0 @@
import os

# Must be set before pyspark/pyarrow are imported to suppress timezone conversion warnings.
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
try:
    import pyspark
    import pyspark.pandas as ps
    import pyspark.sql.functions as F
    import pyspark.sql.types as T
    from pyspark.sql import DataFrame as sparkDataFrame
    from pyspark.pandas import DataFrame as psDataFrame, Series as psSeries, set_option
    from pyspark.util import VersionUtils
except ImportError:
    # Spark is optional: provide inert placeholders so isinstance checks and imports
    # elsewhere still work, and record the error to raise lazily on first use.
    class psDataFrame:
        pass

    F = T = ps = sparkDataFrame = psSeries = psDataFrame
    _spark_major_minor_version = set_option = None
    ERROR = ImportError(
        """Please run pip install flaml[spark]
        and check [here](https://spark.apache.org/docs/latest/api/python/getting_started/install.html)
        for more details about installing Spark."""
    )
else:
    ERROR = None
    # (major, minor) of the installed pyspark, used for version-dependent API calls.
    _spark_major_minor_version = VersionUtils.majorMinorVersion(pyspark.__version__)
try:
    import pandas as pd
    from pandas import DataFrame, Series
except ImportError:
    # pandas is also optional in this module; downstream code checks for None.
    DataFrame = Series = pd = None

View File

@ -1,97 +0,0 @@
# Allowed parameter names for SynapseML LightGBM estimators, used to filter kwargs.
# NOTE: fixed two implicit string concatenations ("extraSeed" "featureFraction" and
# "fobj" "improvementTolerance") that previously merged four valid parameter names
# into two bogus entries, silently rejecting extraSeed/featureFraction/fobj/
# improvementTolerance.
ParamList_LightGBM_Base = [
    "baggingFraction",
    "baggingFreq",
    "baggingSeed",
    "binSampleCount",
    "boostFromAverage",
    "boostingType",
    "catSmooth",
    "categoricalSlotIndexes",
    "categoricalSlotNames",
    "catl2",
    "chunkSize",
    "dataRandomSeed",
    "defaultListenPort",
    "deterministic",
    "driverListenPort",
    "dropRate",
    "dropSeed",
    "earlyStoppingRound",
    "executionMode",
    "extraSeed",
    "featureFraction",
    "featureFractionByNode",
    "featureFractionSeed",
    "featuresCol",
    "featuresShapCol",
    "fobj",
    "improvementTolerance",
    "initScoreCol",
    "isEnableSparse",
    "isProvideTrainingMetric",
    "labelCol",
    "lambdaL1",
    "lambdaL2",
    "leafPredictionCol",
    "learningRate",
    "matrixType",
    "maxBin",
    "maxBinByFeature",
    "maxCatThreshold",
    "maxCatToOnehot",
    "maxDeltaStep",
    "maxDepth",
    "maxDrop",
    "metric",
    "microBatchSize",
    "minDataInLeaf",
    "minDataPerBin",
    "minDataPerGroup",
    "minGainToSplit",
    "minSumHessianInLeaf",
    "modelString",
    "monotoneConstraints",
    "monotoneConstraintsMethod",
    "monotonePenalty",
    "negBaggingFraction",
    "numBatches",
    "numIterations",
    "numLeaves",
    "numTasks",
    "numThreads",
    "objectiveSeed",
    "otherRate",
    "parallelism",
    "passThroughArgs",
    "posBaggingFraction",
    "predictDisableShapeCheck",
    "predictionCol",
    "repartitionByGroupingColumn",
    "seed",
    "skipDrop",
    "slotNames",
    "timeout",
    "topK",
    "topRate",
    "uniformDrop",
    "useBarrierExecutionMode",
    "useMissing",
    "useSingleDatasetMode",
    "validationIndicatorCol",
    "verbosity",
    "weightCol",
    "xGBoostDartMode",
    "zeroAsMissing",
    "objective",
]

# Classifier-only parameters on top of the shared base list.
ParamList_LightGBM_Classifier = ParamList_LightGBM_Base + [
    "isUnbalance",
    "probabilityCol",
    "rawPredictionCol",
    "thresholds",
]

# Regressor-only parameters on top of the shared base list.
ParamList_LightGBM_Regressor = ParamList_LightGBM_Base + ["tweedieVariancePower"]

# Ranker-only parameters on top of the shared base list.
ParamList_LightGBM_Ranker = ParamList_LightGBM_Base + [
    "groupCol",
    "evalAt",
    "labelGain",
    "maxPosition",
]

View File

@ -1,212 +0,0 @@
import numpy as np
from typing import Union
from flaml.automl.spark import psSeries, F
from pyspark.ml.evaluation import (
BinaryClassificationEvaluator,
RegressionEvaluator,
MulticlassClassificationEvaluator,
MultilabelClassificationEvaluator,
RankingEvaluator,
)
def ps_group_counts(groups: Union[psSeries, np.ndarray]) -> np.ndarray:
    """Return the size of each group in `groups`, ordered by first appearance.

    Works for both numpy arrays and pandas-on-Spark series; groups are assumed
    to label contiguous-or-not rows, and counts are reordered to match the order
    in which each group value first occurs.
    """
    if isinstance(groups, np.ndarray):
        _, first_idx, counts = np.unique(groups, return_counts=True, return_index=True)
    else:
        first_idx = groups.drop_duplicates().index.values
        counts = groups.value_counts().sort_index().to_numpy()
    # Reorder counts from sorted-value order to first-appearance order.
    return counts[np.argsort(first_idx)].tolist()
def _process_df(df, label_col, prediction_col):
    """Wrap the label and prediction columns into single-element arrays, as required
    by Spark's RankingEvaluator input format."""
    for column in (label_col, prediction_col):
        df = df.withColumn(column, F.array([df[column]]))
    return df
def _compute_label_from_probability(df, probability_col, prediction_col):
    """Add a prediction column holding the (0-based) index of the largest value in
    the probability array column."""
    # array_max finds the maximum value in the 'probability' array;
    # array_position finds its 1-based index, so subtract 1 for a 0-based label.
    argmax_expr = F.expr(f"array_position({probability_col}, array_max({probability_col}))-1")
    return df.withColumn(prediction_col, argmax_expr.cast("double"))
def spark_metric_loss_score(
    metric_name: str,
    y_predict: psSeries,
    y_true: psSeries,
    sample_weight: psSeries = None,
    groups: psSeries = None,
) -> float:
    """
    Compute the loss score of a metric for spark models.

    For "min-mode" metrics (log_loss, rmse, mse, mae) the evaluator's value is
    returned directly; for all other metrics (where higher is better) the loss is
    1 - score.

    Args:
        metric_name: str | the name of the metric.
        y_predict: psSeries | the predicted values.
        y_true: psSeries | the true values.
        sample_weight: psSeries | the sample weights. Default: None.
        groups: psSeries | the group of each row (for ranking metrics). Default: None.

    Returns:
        float | the loss score. A lower value indicates a better model.
    """
    import warnings

    warnings.filterwarnings("ignore")
    label_col = "label"
    prediction_col = "prediction"
    kwargs = {}
    # Name the series so the joined Spark frame has stable column names.
    y_predict.name = prediction_col
    y_true.name = label_col
    df = y_predict.to_frame().join(y_true)
    if sample_weight is not None:
        sample_weight.name = "weight"
        df = df.join(sample_weight)
        kwargs = {"weightCol": "weight"}
    df = df.to_spark()
    metric_name = metric_name.lower()
    # Metrics whose evaluator value is already a loss (smaller is better).
    min_mode_metrics = ["log_loss", "rmse", "mse", "mae"]
    if metric_name == "rmse":
        evaluator = RegressionEvaluator(
            metricName="rmse",
            labelCol=label_col,
            predictionCol=prediction_col,
            **kwargs,
        )
    elif metric_name == "mse":
        evaluator = RegressionEvaluator(
            metricName="mse",
            labelCol=label_col,
            predictionCol=prediction_col,
            **kwargs,
        )
    elif metric_name == "mae":
        evaluator = RegressionEvaluator(
            metricName="mae",
            labelCol=label_col,
            predictionCol=prediction_col,
            **kwargs,
        )
    elif metric_name == "r2":
        evaluator = RegressionEvaluator(
            metricName="r2",
            labelCol=label_col,
            predictionCol=prediction_col,
            **kwargs,
        )
    elif metric_name == "var":
        evaluator = RegressionEvaluator(
            metricName="var",
            labelCol=label_col,
            predictionCol=prediction_col,
            **kwargs,
        )
    elif metric_name == "roc_auc":
        evaluator = BinaryClassificationEvaluator(
            metricName="areaUnderROC",
            labelCol=label_col,
            rawPredictionCol=prediction_col,
            **kwargs,
        )
    elif metric_name == "pr_auc":
        evaluator = BinaryClassificationEvaluator(
            metricName="areaUnderPR",
            labelCol=label_col,
            rawPredictionCol=prediction_col,
            **kwargs,
        )
    elif metric_name == "accuracy":
        evaluator = MulticlassClassificationEvaluator(
            metricName="accuracy",
            labelCol=label_col,
            predictionCol=prediction_col,
            **kwargs,
        )
    elif metric_name == "log_loss":
        # For log_loss, prediction_col should be probability, and we need to convert it to label
        df = _compute_label_from_probability(df, prediction_col, prediction_col + "_label")
        evaluator = MulticlassClassificationEvaluator(
            metricName="logLoss",
            labelCol=label_col,
            predictionCol=prediction_col + "_label",
            probabilityCol=prediction_col,
            **kwargs,
        )
    elif metric_name == "f1":
        evaluator = MulticlassClassificationEvaluator(
            metricName="f1",
            labelCol=label_col,
            predictionCol=prediction_col,
            **kwargs,
        )
    elif metric_name == "micro_f1":
        evaluator = MultilabelClassificationEvaluator(
            metricName="microF1Measure",
            labelCol=label_col,
            predictionCol=prediction_col,
            **kwargs,
        )
    elif metric_name == "macro_f1":
        evaluator = MultilabelClassificationEvaluator(
            metricName="f1MeasureByLabel",
            labelCol=label_col,
            predictionCol=prediction_col,
            **kwargs,
        )
    elif metric_name == "ap":
        # NOTE(review): unlike the ndcg branches, df is not converted to array
        # columns via _process_df here, and kwargs is not forwarded — confirm
        # RankingEvaluator's expected input format for this metric.
        evaluator = RankingEvaluator(
            metricName="meanAveragePrecision",
            labelCol=label_col,
            predictionCol=prediction_col,
        )
    elif "ndcg" in metric_name:
        # TODO: check if spark.ml ranker has the same format with
        # synapseML ranker, may need to adjust the format of df
        if "@" in metric_name:
            # "ndcg@k": evaluate NDCG truncated at rank k.
            k = int(metric_name.split("@", 1)[-1])
            if groups is None:
                evaluator = RankingEvaluator(
                    metricName="ndcgAtK",
                    labelCol=label_col,
                    predictionCol=prediction_col,
                    k=k,
                )
                df = _process_df(df, label_col, prediction_col)
                score = 1 - evaluator.evaluate(df)
            else:
                # Per-group NDCG@k, averaged over the groups.
                counts = ps_group_counts(groups)
                score = 0
                psum = 0
                for c in counts:
                    y_true_ = y_true[psum : psum + c]
                    y_predict_ = y_predict[psum : psum + c]
                    df = y_true_.to_frame().join(y_predict_).to_spark()
                    df = _process_df(df, label_col, prediction_col)
                    evaluator = RankingEvaluator(
                        metricName="ndcgAtK",
                        labelCol=label_col,
                        predictionCol=prediction_col,
                        k=k,
                    )
                    score -= evaluator.evaluate(df)
                    psum += c
                # Average of (-ndcg) per group, then shift by 1 to make it a loss.
                score /= len(counts)
                score += 1
        else:
            evaluator = RankingEvaluator(metricName="ndcgAtK", labelCol=label_col, predictionCol=prediction_col)
            df = _process_df(df, label_col, prediction_col)
            score = 1 - evaluator.evaluate(df)
        return score
    else:
        raise ValueError(f"Unknown metric name: {metric_name} for spark models.")
    # Loss metrics are returned as-is; score metrics are converted to a loss.
    return evaluator.evaluate(df) if metric_name in min_mode_metrics else 1 - evaluator.evaluate(df)

View File

@ -1,255 +0,0 @@
import logging
from typing import Union, List, Optional, Tuple
import numpy as np
from flaml.automl.spark import (
sparkDataFrame,
ps,
F,
T,
psDataFrame,
psSeries,
_spark_major_minor_version,
DataFrame,
Series,
set_option,
)
# Module logger; propagation is disabled so FLAML controls its own handlers/format.
logger = logging.getLogger(__name__)
logger_formatter = logging.Formatter(
    "[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s", "%m-%d %H:%M:%S"
)
logger.propagate = False
def to_pandas_on_spark(
    df: Union[DataFrame, sparkDataFrame, Series, psDataFrame, psSeries],
    index_col: Optional[str] = None,
    default_index_type: Optional[str] = "distributed-sequence",
) -> Union[psDataFrame, psSeries]:
    """Convert pandas or pyspark dataframe/series to pandas_on_Spark dataframe/series.

    Args:
        df: pandas.DataFrame/series or pyspark dataframe | The input dataframe/series.
        index_col: str, optional | The column name to use as index, default None.
        default_index_type: str, optional | The default index type, default "distributed-sequence".

    Returns:
        pyspark.pandas.DataFrame/Series: The converted pandas-on-Spark dataframe/series.

    Raises:
        TypeError: If df is none of the supported dataframe/series types.

    ```python
    import pandas as pd
    from flaml.automl.spark.utils import to_pandas_on_spark

    pdf = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    psdf = to_pandas_on_spark(pdf)
    print(psdf)

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame(pdf)
    psdf = to_pandas_on_spark(sdf)
    print(psdf)

    pds = Series([1, 2, 3])
    pss = to_pandas_on_spark(pds)
    print(pss)
    ```
    """
    # Controls how pandas-on-Spark builds its default index for converted frames.
    set_option("compute.default_index_type", default_index_type)
    if isinstance(df, (DataFrame, Series)):
        return ps.from_pandas(df)
    elif isinstance(df, sparkDataFrame):
        # pandas_api replaced to_pandas_on_spark in Spark >= 3.3.
        if _spark_major_minor_version[0] == 3 and _spark_major_minor_version[1] < 3:
            return df.to_pandas_on_spark(index_col=index_col)
        else:
            return df.pandas_api(index_col=index_col)
    elif isinstance(df, (psDataFrame, psSeries)):
        # Already pandas-on-Spark; return unchanged.
        return df
    else:
        raise TypeError(f"{type(df)} is not one of pandas.DataFrame, pandas.Series and pyspark.sql.DataFrame")
def train_test_split_pyspark(
    df: Union[sparkDataFrame, psDataFrame],
    stratify_column: Optional[str] = None,
    test_fraction: Optional[float] = 0.2,
    seed: Optional[int] = 1234,
    to_pandas_spark: Optional[bool] = True,
    index_col: Optional[str] = "tmp_index_col",
) -> Tuple[Union[sparkDataFrame, psDataFrame], Union[sparkDataFrame, psDataFrame]]:
    """Split a pyspark dataframe into train and test dataframes.

    Args:
        df: pyspark.sql.DataFrame | The input dataframe.
        stratify_column: str | The column name to stratify the split. Default None.
        test_fraction: float | The fraction of the test data. Default 0.2.
        seed: int | The random seed. Default 1234.
        to_pandas_spark: bool | Whether to convert the output to pandas_on_spark. Default True.
        index_col: str | The column name to use as index. Default "tmp_index_col".

    Returns:
        pyspark.sql.DataFrame/pandas_on_spark DataFrame | The train dataframe.
        pyspark.sql.DataFrame/pandas_on_spark DataFrame | The test dataframe.
    """
    import warnings

    warnings.filterwarnings("ignore")
    if isinstance(df, psDataFrame):
        df = df.to_spark(index_col=index_col)
    if stratify_column:
        # Test data: sample the same fraction from every stratum.
        test_fraction_dict = (
            df.select(stratify_column).distinct().withColumn("fraction", F.lit(test_fraction)).rdd.collectAsMap()
        )
        df_test = df.stat.sampleBy(stratify_column, test_fraction_dict, seed)
        # Train data: everything not sampled into the test set.
        df_train = df.subtract(df_test)
    else:
        df_train, df_test = df.randomSplit([1 - test_fraction, test_fraction], seed)
    if to_pandas_spark:
        df_train = to_pandas_on_spark(df_train, index_col=index_col)
        df_test = to_pandas_on_spark(df_test, index_col=index_col)
        df_train.index.name = None
        df_test.index.name = None
    elif index_col == "tmp_index_col":
        # Drop the synthetic index column when returning plain Spark frames.
        df_train = df_train.drop(index_col)
        df_test = df_test.drop(index_col)
    return [df_train, df_test]
def unique_pandas_on_spark(psds: Union[psSeries, psDataFrame]) -> Tuple[np.ndarray, np.ndarray]:
    """Get the unique values and their occurrence counts from a pandas-on-Spark series.

    A one-column pandas-on-Spark DataFrame is accepted and reduced to its first column.
    """
    if isinstance(psds, psDataFrame):
        psds = psds.iloc[:, 0]
    value_counts_pdf = psds.value_counts().to_pandas()
    return value_counts_pdf.index.values, value_counts_pdf.values
def len_labels(y: Union[psSeries, np.ndarray], return_labels=False) -> Union[int, Optional[np.ndarray]]:
    """Count the unique labels in y; optionally also return the labels themselves."""
    if isinstance(y, (psDataFrame, psSeries)):
        # pandas-on-Spark: use the distributed unique().
        labels = y.unique() if isinstance(y, psSeries) else y.iloc[:, 0].unique()
    else:
        labels = np.unique(y)
    return (len(labels), labels) if return_labels else len(labels)
def unique_value_first_index(y: Union[Series, psSeries, np.ndarray]) -> Tuple[np.ndarray, np.ndarray]:
    """Get the unique values and indices of a pandas series,
    pandas_on_spark series or numpy array."""
    if not isinstance(y, psSeries):
        # numpy handles pandas Series and arrays alike.
        return np.unique(y, return_index=True)
    deduped = y.drop_duplicates().sort_index()
    return deduped.values, deduped.index.values
def iloc_pandas_on_spark(
    psdf: Union[psDataFrame, psSeries, DataFrame, Series],
    index: Union[int, slice, list],
    index_col: Optional[str] = "tmp_index_col",
) -> Union[psDataFrame, psSeries]:
    """Get the rows of a pandas_on_spark dataframe/series by (positional) index.

    Plain pandas inputs delegate to native iloc. For pandas-on-Spark, int/slice
    indices also delegate to iloc, while list indices are resolved by filtering
    on an index column (pandas-on-Spark iloc does not support arbitrary lists).

    Raises:
        TypeError: If index is not an int, slice or list.
    """
    import warnings

    warnings.filterwarnings("ignore")
    if isinstance(psdf, (DataFrame, Series)):
        return psdf.iloc[index]
    if isinstance(index, (int, slice)):
        if isinstance(psdf, psSeries):
            return psdf.iloc[index]
        else:
            return psdf.iloc[index, :]
    elif isinstance(index, list):
        # Materialize an index column on the Spark frame so we can filter by position.
        if isinstance(psdf, psSeries):
            sdf = psdf.to_frame().to_spark(index_col=index_col)
        else:
            if index_col not in psdf.columns:
                sdf = psdf.to_spark(index_col=index_col)
            else:
                sdf = psdf.to_spark()
        sdfiloc = sdf.filter(F.col(index_col).isin(index))
        psdfiloc = to_pandas_on_spark(sdfiloc)
        if isinstance(psdf, psSeries):
            # Return a series again: take the single non-index column.
            psdfiloc = psdfiloc[psdfiloc.columns.drop(index_col)[0]]
        elif index_col not in psdf.columns:
            # Drop the synthetic index column we added above.
            psdfiloc = psdfiloc.drop(columns=[index_col])
        return psdfiloc
    else:
        raise TypeError(f"{type(index)} is not one of int, slice and list for pandas_on_spark iloc")
def spark_kFold(
    dataset: Union[sparkDataFrame, psDataFrame],
    nFolds: int = 3,
    foldCol: str = "",
    seed: int = 42,
    index_col: Optional[str] = "tmp_index_col",
) -> List[Tuple[psDataFrame, psDataFrame]]:
    """Generate k-fold splits for a Spark DataFrame.

    Adopted from https://spark.apache.org/docs/latest/api/python/_modules/pyspark/ml/tuning.html#CrossValidator

    Args:
        dataset: sparkDataFrame / psDataFrame. | The DataFrame to split.
        nFolds: int | The number of folds. Default is 3.
        foldCol: str | The column name to use for fold numbers. If not specified,
            the DataFrame will be randomly split. Default is "".
            The same group will not appear in two different folds (the number of
            distinct groups has to be at least equal to the number of folds).
            The folds are approximately balanced in the sense that the number of
            distinct groups is approximately the same in each fold.
        seed: int | The random seed. Default is 42.
        index_col: str | The name of the index column. Default is "tmp_index_col".

    Returns:
        A list of (train, validation) DataFrames.

    Raises:
        ValueError: If a user-specified fold yields an empty train or validation set.
    """
    import warnings

    warnings.filterwarnings("ignore")
    if isinstance(dataset, psDataFrame):
        dataset = dataset.to_spark(index_col=index_col)
    datasets = []
    if not foldCol:
        # Do random k-fold split.
        h = 1.0 / nFolds
        randCol = f"rand_col_{seed}"
        df = dataset.select("*", F.rand(seed).alias(randCol))
        for i in range(nFolds):
            # Rows whose random value falls in [i*h, (i+1)*h) form fold i's validation set.
            validateLB = i * h
            validateUB = (i + 1) * h
            condition = (df[randCol] >= validateLB) & (df[randCol] < validateUB)
            validation = to_pandas_on_spark(df.filter(condition), index_col=index_col)
            train = to_pandas_on_spark(df.filter(~condition), index_col=index_col)
            datasets.append((train.drop(columns=[randCol]), validation.drop(columns=[randCol])))
    else:
        # Use user-specified fold column
        def get_fold_num(foldNum: int) -> int:
            # Map arbitrary fold/group numbers onto [0, nFolds).
            return int(foldNum % nFolds)

        get_fold_num_udf = F.UserDefinedFunction(get_fold_num, T.IntegerType())
        for i in range(nFolds):
            training = dataset.filter(get_fold_num_udf(dataset[foldCol]) != F.lit(i))
            validation = dataset.filter(get_fold_num_udf(dataset[foldCol]) == F.lit(i))
            if training.rdd.getNumPartitions() == 0 or len(training.take(1)) == 0:
                raise ValueError("The training data at fold %s is empty." % i)
            if validation.rdd.getNumPartitions() == 0 or len(validation.take(1)) == 0:
                raise ValueError("The validation data at fold %s is empty." % i)
            training = to_pandas_on_spark(training, index_col=index_col)
            validation = to_pandas_on_spark(validation, index_col=index_col)
            datasets.append((training, validation))
    return datasets

View File

@ -1,401 +0,0 @@
import inspect
import copy
import time
from typing import Any, Optional
import numpy as np
from flaml import tune
from flaml.automl.logger import logger
from flaml.automl.ml import compute_estimator, train_estimator
from flaml.automl.time_series.ts_data import TimeSeriesDataset
from flaml.automl.spark import psDataFrame, psSeries, DataFrame, Series
class SearchState:
    """Per-estimator search bookkeeping for AutoML.

    Holds the hyperparameter search space derived from a learner class,
    validated starting point(s), the incumbent (best) configuration, and
    the timing statistics the scheduler uses to allocate budget across
    estimators.
    """

    @property
    def search_space(self):
        # Mapping: hp name -> tune sampling domain, built in __init__.
        return self._search_space_domain

    @property
    def estimated_cost4improvement(self):
        # Estimated cost to find a further improvement: the larger of
        # (a) the gap between the last two bests and (b) the time spent
        # since the current best was found.
        return max(
            self.time_best_found - self.time_best_found_old,
            self.total_time_used - self.time_best_found,
        )

    def valid_starting_point_one_dim(self, value_one_dim, domain_one_dim):
        from flaml.tune.space import sample

        """
        For each hp in the starting point, check the following 3 conditions:
        (1) If the type of the starting point does not match the required type in search space, return false
        (2) If the starting point is not in the required search space, return false
        (3) If the search space is a value instead of domain, and the value is not equal to the starting point
        Notice (2) include the case starting point not in user specified search space custom_hp
        """
        if isinstance(domain_one_dim, sample.Domain):
            # The annotation on is_valid's single parameter encodes the expected type.
            renamed_type = list(inspect.signature(domain_one_dim.is_valid).parameters.values())[0].annotation
            type_match = (
                renamed_type == Any
                or isinstance(value_one_dim, renamed_type)
                # an int is also acceptable where a float is required
                or isinstance(value_one_dim, int)
                and renamed_type is float
            )
            if not (type_match and domain_one_dim.is_valid(value_one_dim)):
                return False
        elif value_one_dim != domain_one_dim:
            # Fixed-value "domain": the starting value must equal it exactly.
            return False
        return True

    def valid_starting_point(self, starting_point, search_space):
        # Valid iff every hp (except the FLAML_sample_size pseudo-hp) passes
        # the one-dimensional check against its declared domain.
        return all(
            self.valid_starting_point_one_dim(value, search_space[name].get("domain"))
            for name, value in starting_point.items()
            if name != "FLAML_sample_size"
        )

    def __init__(
        self,
        learner_class,
        data,
        task,
        starting_point=None,
        period=None,
        custom_hp=None,
        max_iter=None,
        budget=None,
    ):
        """Build the search space and initial config(s) for one estimator.

        Args:
            learner_class: The estimator class providing search_space() and
                cost_relative2lgbm().
            data: Training data (a TimeSeriesDataset for forecasting tasks,
                otherwise an array/dataframe whose .shape is used).
            task: The Task object (used to detect forecasting).
            starting_point: A dict, a list of dicts, or None; invalid points
                are filtered out below.
            period: Forecast horizon, only used for time-series tasks.
            custom_hp: Optional overrides merged into the search space.
            max_iter: Search iteration budget; controls starting-point checks.
            budget: Time budget; a negative budget disables cost accounting.
        """
        self.init_eci = learner_class.cost_relative2lgbm() if budget >= 0 else 1
        self._search_space_domain = {}
        self.init_config = None
        self.low_cost_partial_config = {}
        self.cat_hp_cost = {}
        self.ls_ever_converged = False
        self.learner_class = learner_class
        self._budget = budget
        if task.is_ts_forecast():
            data_size = data.train_data.shape
            search_space = learner_class.search_space(data=data, task=task, pred_horizon=period)
        else:
            data_size = data.shape
            search_space = learner_class.search_space(data_size=data_size, task=task)
        self.data_size = data_size
        if custom_hp is not None:
            search_space.update(custom_hp)
        if isinstance(starting_point, dict):
            starting_point = AutoMLState.sanitize(starting_point)
            if max_iter > 1 and not self.valid_starting_point(starting_point, search_space):
                # If the number of iterations is larger than 1, remove invalid point
                logger.warning(
                    "Starting point {} removed because it is outside of the search space".format(starting_point)
                )
                starting_point = None
        elif isinstance(starting_point, list):
            starting_point = [AutoMLState.sanitize(x) for x in starting_point]
            if max_iter > len(starting_point):
                # If the number of starting points is no smaller than max iter, avoid the checking
                starting_point_len = len(starting_point)
                starting_point = [x for x in starting_point if self.valid_starting_point(x, search_space)]
                if starting_point_len > len(starting_point):
                    logger.warning(
                        "Starting points outside of the search space are removed. "
                        f"Remaining starting points for {learner_class}: {starting_point}"
                    )
                starting_point = starting_point or None

        for name, space in search_space.items():
            assert "domain" in space, f"{name}'s domain is missing in the search space spec {space}"
            if space["domain"] is None:
                # don't search this hp
                continue
            self._search_space_domain[name] = space["domain"]

            if "low_cost_init_value" in space:
                self.low_cost_partial_config[name] = space["low_cost_init_value"]
            if "cat_hp_cost" in space:
                self.cat_hp_cost[name] = space["cat_hp_cost"]
            # if a starting point is provided, set the init config to be
            # the starting point provided
            if isinstance(starting_point, dict) and starting_point.get(name) is not None:
                if self.init_config is None:
                    self.init_config = {}
                self.init_config[name] = starting_point[name]
            elif (
                not isinstance(starting_point, list)
                and "init_value" in space
                and self.valid_starting_point_one_dim(space["init_value"], space["domain"])
            ):
                # fall back on the space's own init_value when it is valid
                if self.init_config is None:
                    self.init_config = {}
                self.init_config[name] = space["init_value"]

        if isinstance(starting_point, list):
            self.init_config = starting_point
        else:
            # normalize to a list of configs for the searcher
            self.init_config = [] if self.init_config is None else [self.init_config]

        self._hp_names = list(self._search_space_domain.keys())
        self.search_alg = None
        self.best_config = None
        self.best_result = None
        self.best_loss = self.best_loss_old = np.inf
        self.total_time_used = 0
        self.total_iter = 0
        self.base_eci = None
        self.time_best_found = self.time_best_found_old = 0
        self.time2eval_best = 0
        self.time2eval_best_old = 0
        self.trained_estimator = None
        self.sample_size = None
        self.trial_time = 0

    def update(self, result, time_used):
        """Fold one trial's result into the running statistics.

        Args:
            result: The dict reported by a trial (val_loss, config, timing,
                trained_estimator); falsy when the trial failed/produced nothing.
            time_used: Wall-clock time consumed by the trial.
        """
        if result:
            config = result["config"]
            if config and "FLAML_sample_size" in config:
                self.sample_size = config["FLAML_sample_size"]
            else:
                self.sample_size = self.data_size[0]
            obj = result["val_loss"]
            metric_for_logging = result["metric_for_logging"]
            time2eval = result["time_total_s"]
            trained_estimator = result["trained_estimator"]
            del result["trained_estimator"]  # free up RAM
            # Record the actually-used iteration count (e.g. early-stopped
            # n_estimators) back into the config, if the estimator exposes it.
            n_iter = (
                trained_estimator
                and hasattr(trained_estimator, "ITER_HP")
                and trained_estimator.params.get(trained_estimator.ITER_HP)
            )
            if n_iter:
                if "ml" in config:
                    config["ml"][trained_estimator.ITER_HP] = n_iter
                else:
                    config[trained_estimator.ITER_HP] = n_iter
        else:
            # failed trial: treat as infinitely bad with zero eval time
            obj, time2eval, trained_estimator = np.inf, 0.0, None
            metric_for_logging = config = None

        self.trial_time = time2eval
        # with a negative (disabled) budget, count iterations instead of time
        self.total_time_used += time_used if self._budget >= 0 else 1
        self.total_iter += 1

        if self.base_eci is None:
            self.base_eci = time_used
        if (obj is not None) and (obj < self.best_loss):
            # new incumbent: shift current best stats into the *_old slots
            self.best_loss_old = self.best_loss if self.best_loss < np.inf else 2 * obj
            self.best_loss = obj
            self.best_result = result
            self.time_best_found_old = self.time_best_found
            self.time_best_found = self.total_time_used
            self.iter_best_found = self.total_iter
            self.best_config = config
            self.best_config_sample_size = self.sample_size
            self.best_config_train_time = time_used
            if time2eval:
                self.time2eval_best_old = self.time2eval_best
                self.time2eval_best = time2eval
            # release the previously-kept estimator before replacing it
            if self.trained_estimator and trained_estimator and self.trained_estimator != trained_estimator:
                self.trained_estimator.cleanup()
            if trained_estimator:
                self.trained_estimator = trained_estimator
        elif trained_estimator:
            # not a new best: discard the trial's estimator
            trained_estimator.cleanup()
        self.metric_for_logging = metric_for_logging
        self.val_loss, self.config = obj, config

    def get_hist_config_sig(self, sample_size, config):
        # Signature string identifying (sample_size, hp values) for history lookup.
        config_values = tuple([config[k] for k in self._hp_names if k in config])
        config_sig = str(sample_size) + "_" + str(config_values)
        return config_sig

    def est_retrain_time(self, retrain_sample_size):
        # Linear extrapolation of the best config's eval time to a new sample size.
        assert self.best_config_sample_size is not None, "need to first get best_config_sample_size"
        return self.time2eval_best * retrain_sample_size / self.best_config_sample_size
class AutoMLState:
    """Mutable shared state for one AutoML run.

    Carries the training/validation data, per-estimator fit kwargs, budgets
    and metric settings, and provides the trial/train entry points used by
    the tuner.
    """

    def prepare_sample_train_data(self, sample_size: int):
        """Slice the training data (and weights/groups) down to ``sample_size``.

        Returns:
            Tuple of (sampled_X_train, sampled_y_train, sampled_weight, groups);
            weight/groups are None when not applicable.
        """
        sampled_weight = groups = None
        if sample_size <= self.data_size[0]:
            if isinstance(self.X_train, TimeSeriesDataset):
                # take the most recent sample_size rows; y lives inside the dataset
                sampled_X_train = copy.copy(self.X_train)
                sampled_X_train.train_data = self.X_train.train_data.iloc[-sample_size:]
                sampled_y_train = None
            else:
                if isinstance(self.X_train, (DataFrame, psDataFrame)):
                    sampled_X_train = self.X_train.iloc[:sample_size]
                else:
                    sampled_X_train = self.X_train[:sample_size]
                if isinstance(self.y_train, (Series, psSeries)):
                    sampled_y_train = self.y_train.iloc[:sample_size]
                else:
                    sampled_y_train = self.y_train[:sample_size]
            weight = self.fit_kwargs.get(
                "sample_weight"
            )  # NOTE: _prepare_sample_train_data is before kwargs is updated to fit_kwargs_by_estimator
            if weight is not None:
                sampled_weight = (
                    weight.iloc[:sample_size] if isinstance(weight, (Series, psSeries)) else weight[:sample_size]
                )
            if self.groups is not None:
                groups = (
                    self.groups.iloc[:sample_size]
                    if isinstance(self.groups, (Series, psSeries))
                    else self.groups[:sample_size]
                )
        else:
            # requested size covers everything: use the full (unsampled) data
            sampled_X_train = self.X_train_all
            sampled_y_train = self.y_train_all
            if (
                "sample_weight" in self.fit_kwargs
            ):  # NOTE: _prepare_sample_train_data is before kwargs is updated to fit_kwargs_by_estimator
                sampled_weight = self.sample_weight_all
            if self.groups is not None:
                groups = self.groups_all
        return sampled_X_train, sampled_y_train, sampled_weight, groups

    @staticmethod
    def _compute_with_config_base(
        config_w_resource: dict,
        state: "AutoMLState",
        estimator: str,
        is_report: bool = True,
    ) -> dict:
        """Evaluate one hyperparameter config for ``estimator`` (a tune trial).

        Args:
            config_w_resource: Config possibly containing the FLAML_sample_size
                pseudo-hp.
            state: The shared AutoMLState.
            estimator: Name of the estimator being tuned.
            is_report: Whether to report the result back to tune.

        Returns:
            Result dict with pred_time, wall_clock_time, metric_for_logging,
            val_loss and trained_estimator.
        """
        if "FLAML_sample_size" in config_w_resource:
            sample_size = int(config_w_resource["FLAML_sample_size"])
        else:
            sample_size = state.data_size[0]

        this_estimator_kwargs = state.fit_kwargs_by_estimator.get(
            estimator
        ).copy()  # NOTE: _compute_with_config_base is after kwargs is updated to fit_kwargs_by_estimator
        (
            sampled_X_train,
            sampled_y_train,
            sampled_weight,
            groups,
        ) = state.task.prepare_sample_train_data(state, sample_size)
        if sampled_weight is not None:
            # stash the full weights so they can be restored after evaluation
            weight = this_estimator_kwargs["sample_weight"]
            this_estimator_kwargs["sample_weight"] = sampled_weight
        if groups is not None:
            this_estimator_kwargs["groups"] = groups
        config = config_w_resource.copy()
        if "FLAML_sample_size" in config:
            del config["FLAML_sample_size"]
        # Remaining budget; halved pro-rata when training on a subsample
        # (leaving room for a potential full-data retrain).
        budget = (
            None
            if state.time_budget < 0
            else state.time_budget - state.time_from_start
            if sample_size == state.data_size[0]
            else (state.time_budget - state.time_from_start) / 2 * sample_size / state.data_size[0]
        )

        (
            trained_estimator,
            val_loss,
            metric_for_logging,
            _,
            pred_time,
        ) = compute_estimator(
            sampled_X_train,
            sampled_y_train,
            state.X_val,
            state.y_val,
            state.weight_val,
            state.groups_val,
            state.train_time_limit if budget is None else min(budget, state.train_time_limit or np.inf),
            state.kf,
            config,
            state.task,
            estimator,
            state.eval_method,
            state.metric,
            state.best_loss,
            state.n_jobs,
            state.learner_classes.get(estimator),
            state.cv_score_agg_func,
            state.log_training_metric,
            this_estimator_kwargs,
            state.free_mem_ratio,
        )
        if state.retrain_final and not state.model_history:
            # the model will be retrained at the end; no need to keep this one
            trained_estimator.cleanup()

        result = {
            "pred_time": pred_time,
            "wall_clock_time": time.time() - state._start_time_flag,
            "metric_for_logging": metric_for_logging,
            "val_loss": val_loss,
            "trained_estimator": trained_estimator,
        }
        if sampled_weight is not None:
            # restore the un-sampled weights for subsequent trials
            this_estimator_kwargs["sample_weight"] = weight
        if is_report is True:
            tune.report(**result)
        return result

    @classmethod
    def sanitize(cls, config: dict) -> dict:
        """Make a config ready for passing to estimator."""
        config = config.get("ml", config).copy()
        config.pop("FLAML_sample_size", None)
        config.pop("learner", None)
        config.pop("_choice_", None)
        return config

    def _train_with_config(
        self,
        estimator: str,
        config_w_resource: dict,
        sample_size: Optional[int] = None,
    ):
        """Train ``estimator`` with a given config (no evaluation).

        Args:
            estimator: Name of the estimator to train.
            config_w_resource: Config possibly containing FLAML_sample_size.
            sample_size: Override for the number of training rows; defaults to
                the config's FLAML_sample_size or the full dataset.

        Returns:
            Tuple of (trained estimator, training time in seconds).
        """
        if not sample_size:
            sample_size = config_w_resource.get("FLAML_sample_size", len(self.y_train_all))
        config = AutoMLState.sanitize(config_w_resource)

        this_estimator_kwargs = self.fit_kwargs_by_estimator.get(
            estimator
        ).copy()  # NOTE: _train_with_config is after kwargs is updated to fit_kwargs_by_estimator
        (
            sampled_X_train,
            sampled_y_train,
            sampled_weight,
            groups,
        ) = self.task.prepare_sample_train_data(self, sample_size)
        if sampled_weight is not None:
            # stash full weights so they can be restored after training
            weight = this_estimator_kwargs[
                "sample_weight"
            ]  # NOTE: _train_with_config is after kwargs is updated to fit_kwargs_by_estimator
            this_estimator_kwargs[
                "sample_weight"
            ] = sampled_weight  # NOTE: _train_with_config is after kwargs is updated to fit_kwargs_by_estimator
        if groups is not None:
            this_estimator_kwargs[
                "groups"
            ] = groups  # NOTE: _train_with_config is after kwargs is updated to fit_kwargs_by_estimator

        budget = None if self.time_budget < 0 else self.time_budget - self.time_from_start

        estimator, train_time = train_estimator(
            X_train=sampled_X_train,
            y_train=sampled_y_train,
            config_dic=config,
            task=self.task,
            estimator_name=estimator,
            n_jobs=self.n_jobs,
            estimator_class=self.learner_classes.get(estimator),
            budget=budget,
            fit_kwargs=this_estimator_kwargs,  # NOTE: _train_with_config is after kwargs is updated to fit_kwargs_by_estimator
            eval_metric=self.metric if hasattr(self, "metric") else "train_time",
            free_mem_ratio=self.free_mem_ratio,
        )

        if sampled_weight is not None:
            this_estimator_kwargs[
                "sample_weight"
            ] = weight  # NOTE: _train_with_config is after kwargs is updated to fit_kwargs_by_estimator

        return estimator, train_time

View File

@ -1 +0,0 @@
from .task import Task

View File

@ -1,19 +0,0 @@
from typing import Optional, Union
import numpy as np
from flaml.automl.data import DataFrame, Series
from flaml.automl.task.task import Task, TS_FORECAST
def task_factory(
    task_name: str,
    X_train: Optional[Union[np.ndarray, DataFrame]] = None,
    y_train: Optional[Union[np.ndarray, DataFrame, Series]] = None,
) -> Task:
    """Create the Task implementation matching ``task_name``.

    Args:
        task_name: Task identifier; names in TS_FORECAST select the
            time-series implementation, everything else the generic one.
        X_train: Optional training features, forwarded to the Task.
        y_train: Optional training labels, forwarded to the Task.

    Returns:
        A TimeSeriesTask or GenericTask instance.
    """
    # Imported lazily to avoid a circular dependency between task modules.
    from flaml.automl.task.generic_task import GenericTask
    from flaml.automl.task.time_series_task import TimeSeriesTask

    task_cls = TimeSeriesTask if task_name in TS_FORECAST else GenericTask
    return task_cls(task_name, X_train, y_train)

View File

@ -1,880 +0,0 @@
import logging
import time
from typing import List, Optional
import numpy as np
from flaml.automl.data import TS_TIMESTAMP_COL, concat
from flaml.automl.ml import EstimatorSubclass, get_val_loss, default_cv_score_agg_func
from flaml.automl.task.task import (
Task,
get_classification_objective,
TS_FORECAST,
TS_FORECASTPANEL,
)
from flaml.config import RANDOM_SEED
from flaml.automl.spark import ps, psDataFrame, psSeries, pd
from flaml.automl.spark.utils import (
iloc_pandas_on_spark,
spark_kFold,
train_test_split_pyspark,
unique_pandas_on_spark,
unique_value_first_index,
len_labels,
set_option,
)
try:
from scipy.sparse import issparse
except ImportError:
pass
try:
from sklearn.utils import shuffle
from sklearn.model_selection import (
train_test_split,
RepeatedStratifiedKFold,
RepeatedKFold,
GroupKFold,
TimeSeriesSplit,
GroupShuffleSplit,
StratifiedGroupKFold,
)
except ImportError:
pass
logger = logging.getLogger(__name__)
class GenericTask(Task):
    @property
    def estimators(self):
        """Lazily-built registry mapping estimator names to estimator classes."""
        if self._estimators is None:
            # put this into a function to avoid circular dependency
            from flaml.automl.model import (
                XGBoostSklearnEstimator,
                XGBoostLimitDepthEstimator,
                RandomForestEstimator,
                LGBMEstimator,
                LRL1Classifier,
                LRL2Classifier,
                CatBoostEstimator,
                ExtraTreesEstimator,
                KNeighborsEstimator,
                TransformersEstimator,
                TransformersEstimatorModelSelection,
                SparkLGBMEstimator,
            )

            self._estimators = {
                "xgboost": XGBoostSklearnEstimator,
                "xgb_limitdepth": XGBoostLimitDepthEstimator,
                "rf": RandomForestEstimator,
                "lgbm": LGBMEstimator,
                "lgbm_spark": SparkLGBMEstimator,
                "lrl1": LRL1Classifier,
                "lrl2": LRL2Classifier,
                "catboost": CatBoostEstimator,
                "extra_tree": ExtraTreesEstimator,
                "kneighbor": KNeighborsEstimator,
                "transformer": TransformersEstimator,
                "transformer_ms": TransformersEstimatorModelSelection,
            }
        return self._estimators
    def validate_data(
        self,
        automl,
        state,
        X_train_all,
        y_train_all,
        dataframe,
        label,
        X_val=None,
        y_val=None,
        groups_val=None,
        groups=None,
    ):
        """Validate and normalize user-supplied training/validation data.

        Accepts either (X_train_all, y_train_all) or (dataframe, label),
        caches pyspark data, applies the DataTransformer where possible, and
        writes the normalized results onto ``automl`` and ``state``.
        """
        if X_train_all is not None and y_train_all is not None:
            assert isinstance(X_train_all, (np.ndarray, pd.DataFrame, psDataFrame)) or issparse(X_train_all), (
                "X_train_all must be a numpy array, a pandas dataframe, "
                "a Scipy sparse matrix or a pyspark.pandas dataframe."
            )
            assert isinstance(
                y_train_all, (np.ndarray, pd.Series, psSeries)
            ), "y_train_all must be a numpy array, a pandas series or a pyspark.pandas series."
            assert X_train_all.size != 0 and y_train_all.size != 0, "Input data must not be empty."
            # normalize shapes: 1-D X becomes a single-column 2-D array
            if isinstance(X_train_all, np.ndarray) and len(X_train_all.shape) == 1:
                X_train_all = np.reshape(X_train_all, (X_train_all.size, 1))
            if isinstance(y_train_all, np.ndarray):
                y_train_all = y_train_all.flatten()
            assert X_train_all.shape[0] == y_train_all.shape[0], "# rows in X_train must match length of y_train."
            if isinstance(X_train_all, psDataFrame):
                X_train_all = X_train_all.spark.cache()  # cache data to improve compute speed
                y_train_all = y_train_all.to_frame().spark.cache()[y_train_all.name]
                logger.debug(f"X_train_all and y_train_all cached, shape of X_train_all: {X_train_all.shape}")
            automl._df = isinstance(X_train_all, (pd.DataFrame, psDataFrame))
            automl._nrow, automl._ndim = X_train_all.shape
            if self.is_ts_forecast():
                X_train_all = pd.DataFrame(X_train_all) if isinstance(X_train_all, np.ndarray) else X_train_all
                X_train_all, y_train_all = self._validate_ts_data(X_train_all, y_train_all)
            X, y = X_train_all, y_train_all
        elif dataframe is not None and label is not None:
            assert isinstance(
                dataframe, (pd.DataFrame, psDataFrame)
            ), "dataframe must be a pandas DataFrame or a pyspark.pandas DataFrame."
            assert (
                label in dataframe.columns
            ), f"The provided label column name `{label}` doesn't exist in the provided dataframe."
            if isinstance(dataframe, psDataFrame):
                dataframe = dataframe.spark.cache()  # cache data to improve compute speed
                logger.debug(f"dataframe cached, shape of dataframe: {dataframe.shape}")
            automl._df = True
            if self.is_ts_forecast():
                dataframe = self._validate_ts_data(dataframe)
            # TODO: to support pyspark.sql.DataFrame and pure dataframe mode
            X = dataframe.drop(columns=label)
            automl._nrow, automl._ndim = X.shape
            y = dataframe[label]
        else:
            raise ValueError("either X_train+y_train or dataframe+label are required")

        # check the validity of input dimensions for NLP tasks, so need to check _is_nlp_task not estimator
        if self.is_nlp():
            from flaml.automl.nlp.utils import is_a_list_of_str

            is_all_str = True
            is_all_list = True
            for column in X.columns:
                assert X[column].dtype.name in (
                    "object",
                    "string",
                ), "If the task is an NLP task, X can only contain text columns"
                for _, each_cell in X[column].items():
                    if each_cell is not None:
                        is_str = isinstance(each_cell, str)
                        is_list_of_int = isinstance(each_cell, list) and all(isinstance(x, int) for x in each_cell)
                        is_list_of_str = is_a_list_of_str(each_cell)
                        if self.is_token_classification():
                            # NOTE(review): the assert message below is a tuple
                            # (always truthy) rather than a single string.
                            assert is_list_of_str, (
                                "For the token-classification task, the input column needs to be a list of string,"
                                "instead of string, e.g., ['EU', 'rejects','German', 'call','to','boycott','British','lamb','.',].",
                                "For more examples, please refer to test/nlp/test_autohf_tokenclassification.py",
                            )
                        else:
                            assert is_str or is_list_of_int, (
                                "Each column of the input must either be str (untokenized) "
                                "or a list of integers (tokenized)"
                            )
                        is_all_str &= is_str
                        is_all_list &= is_list_of_int or is_list_of_str
            assert is_all_str or is_all_list, (
                "Currently FLAML only supports two modes for NLP: either all columns of X are string (non-tokenized), "
                "or all columns of X are integer ids (tokenized)"
            )

        if isinstance(X, psDataFrame):
            # TODO: support pyspark.pandas dataframe in DataTransformer
            automl._skip_transform = True
        if automl._skip_transform or issparse(X_train_all):
            # no transformation possible for sparse/pyspark input
            automl._transformer = automl._label_transformer = False
            automl._X_train_all, automl._y_train_all = X, y
        else:
            from flaml.automl.data import DataTransformer

            automl._transformer = DataTransformer()

            (
                automl._X_train_all,
                automl._y_train_all,
            ) = automl._transformer.fit_transform(X, y, self)
            automl._label_transformer = automl._transformer.label_transformer
            if self.is_token_classification():
                if hasattr(automl._label_transformer, "label_list"):
                    state.fit_kwargs.update({"label_list": automl._label_transformer.label_list})
                elif "label_list" not in state.fit_kwargs:
                    for each_fit_kwargs in state.fit_kwargs_by_estimator.values():
                        # NOTE(review): the string on the line after this assert is a
                        # dangling expression, not part of the assert message.
                        assert (
                            "label_list" in each_fit_kwargs
                        ), "For the token-classification task, you must either (1) pass token labels; or (2) pass id labels and the label list. "
                        "Please refer to the documentation for more details: https://microsoft.github.io/FLAML/docs/Examples/AutoML-NLP#a-simple-token-classification-example"
        automl._feature_names_in_ = (
            automl._X_train_all.columns.to_list() if hasattr(automl._X_train_all, "columns") else None
        )

        automl._sample_weight_full = state.fit_kwargs.get(
            "sample_weight"
        )  # NOTE: _validate_data is before kwargs is updated to fit_kwargs_by_estimator
        if X_val is not None and y_val is not None:
            assert isinstance(X_val, (np.ndarray, pd.DataFrame, psDataFrame)) or issparse(X_train_all), (
                "X_val must be None, a numpy array, a pandas dataframe, "
                "a Scipy sparse matrix or a pyspark.pandas dataframe."
            )
            assert isinstance(y_val, (np.ndarray, pd.Series, psSeries)), (
                "y_val must be None, a numpy array, a pandas series " "or a pyspark.pandas series."
            )
            assert X_val.size != 0 and y_val.size != 0, (
                "Validation data are expected to be nonempty. " "Use None for X_val and y_val if no validation data."
            )
            if isinstance(y_val, np.ndarray):
                y_val = y_val.flatten()
            assert X_val.shape[0] == y_val.shape[0], "# rows in X_val must match length of y_val."
            if automl._transformer:
                state.X_val = automl._transformer.transform(X_val)
            else:
                state.X_val = X_val
            # If it's NLG_TASKS, y_val is a pandas series containing the output sequence tokens,
            # so we cannot use label_transformer.transform to process it
            if automl._label_transformer:
                state.y_val = automl._label_transformer.transform(y_val)
            else:
                state.y_val = y_val
        else:
            state.X_val = state.y_val = None

        if groups is not None and len(groups) != automl._nrow:
            # groups is given as group counts
            state.groups = np.concatenate([[i] * c for i, c in enumerate(groups)])
            assert len(state.groups) == automl._nrow, "the sum of group counts must match the number of examples"
            state.groups_val = (
                np.concatenate([[i] * c for i, c in enumerate(groups_val)]) if groups_val is not None else None
            )
        else:
            state.groups_val = groups_val
            state.groups = groups

        automl.data_size_full = len(automl._y_train_all)
    @staticmethod
    def _split_pyspark(state, X_train_all, y_train_all, split_ratio, stratify=None):
        """Train/validation split for pyspark.pandas data.

        Joins X, y (and sample weights if present) into one frame, splits it
        with train_test_split_pyspark, then separates the columns again.
        Returns 4 values, or 6 (incl. train/val weights) when weights exist.
        """
        # TODO: optimize this
        set_option("compute.ops_on_diff_frames", True)
        if not isinstance(y_train_all, (psDataFrame, psSeries)):
            raise ValueError("y_train_all must be a pyspark.pandas dataframe or series")
        df_all_in_one = X_train_all.join(y_train_all)
        # label column name doubles as the stratification column
        stratify_column = y_train_all.name if isinstance(y_train_all, psSeries) else y_train_all.columns[0]
        ret_sample_weight = False
        if (
            "sample_weight" in state.fit_kwargs
        ):  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
            # fit_kwargs["sample_weight"] is an numpy array
            ps_sample_weight = ps.DataFrame(
                state.fit_kwargs["sample_weight"],
                columns=["sample_weight"],
            )
            df_all_in_one = df_all_in_one.join(ps_sample_weight)
            ret_sample_weight = True
        df_all_train, df_all_val = train_test_split_pyspark(
            df_all_in_one,
            None if stratify is None else stratify_column,
            test_fraction=split_ratio,
            seed=RANDOM_SEED,
        )
        # strip the label/weight columns back out of the feature frames
        columns_to_drop = [c for c in df_all_train.columns if c in [stratify_column, "sample_weight"]]
        X_train = df_all_train.drop(columns_to_drop)
        X_val = df_all_val.drop(columns_to_drop)
        y_train = df_all_train[stratify_column]
        y_val = df_all_val[stratify_column]

        if ret_sample_weight:
            return (
                X_train,
                X_val,
                y_train,
                y_val,
                df_all_train["sample_weight"],
                df_all_val["sample_weight"],
            )
        return X_train, X_val, y_train, y_val
    @staticmethod
    def _train_test_split(state, X, y, first=None, rest=None, split_ratio=0.2, stratify=None):
        """Split X/y into train/validation, honoring sample weights.

        Dispatches on (is pyspark data, has sample_weight). When ``first``
        is given it indexes rows that were held out before splitting (one
        per label) and their weights are re-attached to the train side.
        Mutates state.weight_val and state.fit_kwargs["sample_weight"].
        """
        condition_type = isinstance(X, (psDataFrame, psSeries))
        # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
        condition_param = "sample_weight" in state.fit_kwargs
        if not condition_type and condition_param:
            # in-memory data with weights: split X, y and weights together
            sample_weight = (
                state.fit_kwargs["sample_weight"] if rest is None else state.fit_kwargs["sample_weight"][rest]
            )
            (
                X_train,
                X_val,
                y_train,
                y_val,
                weight_train,
                weight_val,
            ) = train_test_split(
                X,
                y,
                sample_weight,
                test_size=split_ratio,
                stratify=stratify,
                random_state=RANDOM_SEED,
            )
            if first is not None:
                # re-attach the weights of the held-out first-occurrence rows
                weight1 = state.fit_kwargs["sample_weight"][first]
                state.weight_val = concat(weight1, weight_val)
                state.fit_kwargs["sample_weight"] = concat(weight1, weight_train)
            else:
                state.weight_val = weight_val
                state.fit_kwargs["sample_weight"] = weight_train
        elif not condition_type and not condition_param:
            # in-memory data, no weights
            X_train, X_val, y_train, y_val = train_test_split(
                X,
                y,
                test_size=split_ratio,
                stratify=stratify,
                random_state=RANDOM_SEED,
            )
        elif condition_type and condition_param:
            # pyspark data with weights
            (
                X_train,
                X_val,
                y_train,
                y_val,
                weight_train,
                weight_val,
            ) = GenericTask._split_pyspark(state, X, y, split_ratio, stratify)

            if first is not None:
                weight1 = state.fit_kwargs["sample_weight"][first]
                state.weight_val = concat(weight1, weight_val)
                state.fit_kwargs["sample_weight"] = concat(weight1, weight_train)
            else:
                state.weight_val = weight_val
                state.fit_kwargs["sample_weight"] = weight_train
        else:
            # pyspark data, no weights
            X_train, X_val, y_train, y_val = GenericTask._split_pyspark(state, X, y, split_ratio, stratify)

        return X_train, X_val, y_train, y_val
    def prepare_data(
        self,
        state,
        X_train_all,
        y_train_all,
        auto_augment,
        eval_method,
        split_type,
        split_ratio,
        n_splits,
        data_is_df,
        sample_weight_full,
    ) -> int:
        """Augment, shuffle and split the data; set up holdout or CV folds.

        Writes the results onto ``state`` (X_train/y_train, X_val/y_val,
        data sizes, groups, and ``state.kf`` for cross-validation).
        NOTE(review): despite the ``-> int`` annotation, this returns None.
        """
        X_val, y_val = state.X_val, state.y_val
        if issparse(X_val):
            X_val = X_val.tocsr()
        if issparse(X_train_all):
            X_train_all = X_train_all.tocsr()

        is_spark_dataframe = isinstance(X_train_all, (psDataFrame, psSeries))
        self.is_spark_dataframe = is_spark_dataframe

        if (
            self.is_classification()
            and auto_augment
            and state.fit_kwargs.get("sample_weight")
            is None  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
            and split_type in ["stratified", "uniform"]
            and not self.is_token_classification()
        ):
            # logger.info(f"label {pd.unique(y_train_all)}")
            if is_spark_dataframe:
                label_set, counts = unique_pandas_on_spark(y_train_all)
                # TODO: optimize this
                set_option("compute.ops_on_diff_frames", True)
            else:
                label_set, counts = np.unique(y_train_all, return_counts=True)
            # augment rare classes
            rare_threshld = 20
            rare = counts < rare_threshld
            rare_label, rare_counts = label_set[rare], counts[rare]
            for i, label in enumerate(rare_label.tolist()):
                count = rare_count = rare_counts[i]
                rare_index = y_train_all == label
                n = len(y_train_all)
                # duplicate the rare class's rows until it reaches the threshold
                while count < rare_threshld:
                    if data_is_df:
                        X_train_all = concat(X_train_all, X_train_all.iloc[:n].loc[rare_index])
                    else:
                        X_train_all = concat(X_train_all, X_train_all[:n][rare_index, :])
                    if isinstance(y_train_all, (pd.Series, psSeries)):
                        y_train_all = concat(y_train_all, y_train_all.iloc[:n].loc[rare_index])
                    else:
                        y_train_all = np.concatenate([y_train_all, y_train_all[:n][rare_index]])
                    count += rare_count
                logger.info(f"class {label} augmented from {rare_count} to {count}")
        SHUFFLE_SPLIT_TYPES = ["uniform", "stratified"]
        if is_spark_dataframe:
            # no need to shuffle pyspark dataframe
            pass
        elif split_type in SHUFFLE_SPLIT_TYPES:
            if sample_weight_full is not None:
                # shuffle X, y and sample weights with the same permutation
                X_train_all, y_train_all, state.sample_weight_all = shuffle(
                    X_train_all,
                    y_train_all,
                    sample_weight_full,
                    random_state=RANDOM_SEED,
                )
                state.fit_kwargs[
                    "sample_weight"
                ] = (
                    state.sample_weight_all
                )  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                if isinstance(state.sample_weight_all, pd.Series):
                    state.sample_weight_all.reset_index(drop=True, inplace=True)
            else:
                X_train_all, y_train_all = shuffle(X_train_all, y_train_all, random_state=RANDOM_SEED)
            if data_is_df:
                X_train_all.reset_index(drop=True, inplace=True)
                if isinstance(y_train_all, pd.Series):
                    y_train_all.reset_index(drop=True, inplace=True)

        X_train, y_train = X_train_all, y_train_all
        state.groups_all = state.groups
        if X_val is None and eval_method == "holdout":
            # no user-provided validation set: carve one out of the training data
            if split_type == "time":
                assert not self.is_ts_forecast(), "For a TS forecast task, this code should never be called"
                is_sample_weight = "sample_weight" in state.fit_kwargs
                if not is_spark_dataframe and is_sample_weight:
                    (
                        X_train,
                        X_val,
                        y_train,
                        y_val,
                        state.fit_kwargs[
                            "sample_weight"
                        ],  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                        state.weight_val,
                    ) = train_test_split(
                        X_train_all,
                        y_train_all,
                        state.fit_kwargs[
                            "sample_weight"
                        ],  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                        test_size=split_ratio,
                        shuffle=False,
                    )
                elif not is_spark_dataframe and not is_sample_weight:
                    X_train, X_val, y_train, y_val = train_test_split(
                        X_train_all,
                        y_train_all,
                        test_size=split_ratio,
                        shuffle=False,
                    )
                elif is_spark_dataframe and is_sample_weight:
                    (
                        X_train,
                        X_val,
                        y_train,
                        y_val,
                        state.fit_kwargs[
                            "sample_weight"
                        ],  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                        state.weight_val,
                    ) = self._split_pyspark(state, X_train_all, y_train_all, split_ratio)
                else:
                    X_train, X_val, y_train, y_val = self._split_pyspark(state, X_train_all, y_train_all, split_ratio)
            if split_type == "group":
                gss = GroupShuffleSplit(n_splits=1, test_size=split_ratio, random_state=RANDOM_SEED)
                for train_idx, val_idx in gss.split(X_train_all, y_train_all, state.groups_all):
                    if data_is_df:
                        X_train = X_train_all.iloc[train_idx]
                        X_val = X_train_all.iloc[val_idx]
                    else:
                        X_train, X_val = X_train_all[train_idx], X_train_all[val_idx]
                    y_train, y_val = y_train_all[train_idx], y_train_all[val_idx]
                    state.groups = state.groups_all[train_idx]
                    state.groups_val = state.groups_all[val_idx]
            elif self.is_classification():
                # for classification, make sure the labels are complete in both
                # training and validation data
                label_set, first = unique_value_first_index(y_train_all)
                rest = []
                last = 0
                first.sort()
                # "rest" = every index except the first occurrence of each label
                for i in range(len(first)):
                    rest.extend(range(last, first[i]))
                    last = first[i] + 1
                rest.extend(range(last, len(y_train_all)))
                X_first = X_train_all.iloc[first] if data_is_df else X_train_all[first]
                X_rest = X_train_all.iloc[rest] if data_is_df else X_train_all[rest]
                y_rest = (
                    y_train_all[rest]
                    if isinstance(y_train_all, np.ndarray)
                    else iloc_pandas_on_spark(y_train_all, rest)
                    if is_spark_dataframe
                    else y_train_all.iloc[rest]
                )
                stratify = y_rest if split_type == "stratified" else None
                X_train, X_val, y_train, y_val = self._train_test_split(
                    state, X_rest, y_rest, first, rest, split_ratio, stratify
                )
                # put one instance of every label back into both halves
                X_train = concat(X_first, X_train)
                y_train = concat(label_set, y_train) if data_is_df else np.concatenate([label_set, y_train])
                X_val = concat(X_first, X_val)
                y_val = concat(label_set, y_val) if data_is_df else np.concatenate([label_set, y_val])
            elif self.is_regression():
                X_train, X_val, y_train, y_val = self._train_test_split(
                    state, X_train_all, y_train_all, split_ratio=split_ratio
                )
        state.data_size = X_train.shape
        state.data_size_full = len(y_train_all)
        state.X_train, state.y_train = X_train, y_train
        state.X_val, state.y_val = X_val, y_val
        state.X_train_all = X_train_all
        state.y_train_all = y_train_all
        y_train_all_size = y_train_all.size
        if eval_method == "holdout":
            state.kf = None
            return
        # cross-validation: build the fold splitter
        if split_type == "group":
            # logger.info("Using GroupKFold")
            assert len(state.groups_all) == y_train_all_size, "the length of groups must match the number of examples"
            assert (
                len_labels(state.groups_all) >= n_splits
            ), "the number of groups must be equal or larger than n_splits"
            state.kf = GroupKFold(n_splits)
        elif split_type == "stratified":
            # logger.info("Using StratifiedKFold")
            assert y_train_all_size >= n_splits, (
                f"{n_splits}-fold cross validation" f" requires input data with at least {n_splits} examples."
            )
            assert y_train_all_size >= 2 * n_splits, (
                f"{n_splits}-fold cross validation with metric=r2 "
                f"requires input data with at least {n_splits*2} examples."
            )
            state.kf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED)
        elif split_type == "time":
            # logger.info("Using TimeSeriesSplit")
            if self.is_ts_forecast() and not self.is_ts_forecastpanel():
                period = state.fit_kwargs[
                    "period"
                ]  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                if period * (n_splits + 1) > y_train_all_size:
                    # shrink the number of folds to fit the available history
                    n_splits = int(y_train_all_size / period - 1)
                    assert n_splits >= 2, (
                        f"cross validation for forecasting period={period}"
                        f" requires input data with at least {3 * period} examples."
                    )
                    logger.info(f"Using nsplits={n_splits} due to data size limit.")
                state.kf = TimeSeriesSplit(n_splits=n_splits, test_size=period)
            elif self.is_ts_forecastpanel():
                n_groups = len(X_train.groupby(state.fit_kwargs.get("group_ids")).size())
                period = state.fit_kwargs.get("period")
                state.kf = TimeSeriesSplit(n_splits=n_splits, test_size=period * n_groups)
            else:
                state.kf = TimeSeriesSplit(n_splits=n_splits)
            # state.kf = TimeSeriesSplit(n_splits=n_splits)
        elif isinstance(split_type, str):
            # logger.info("Using RepeatedKFold")
            state.kf = RepeatedKFold(n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED)
        else:
            # logger.info("Using splitter object")
            state.kf = split_type
        if isinstance(state.kf, (GroupKFold, StratifiedGroupKFold)):
            # self._split_type is either "group", a GroupKFold object, or a StratifiedGroupKFold object
            state.kf.groups = state.groups_all
def decide_split_type(
self,
split_type,
y_train_all,
fit_kwargs,
groups=None,
) -> str:
assert not self.is_ts_forecast(), "This function should never be called as part of a time-series task."
if self.name == "classification":
self.name = get_classification_objective(len_labels(y_train_all))
if not isinstance(split_type, str):
assert hasattr(split_type, "split") and hasattr(
split_type, "get_n_splits"
), "split_type must be a string or a splitter object with split and get_n_splits methods."
assert (
not isinstance(split_type, GroupKFold) or groups is not None
), "GroupKFold requires groups to be provided."
return split_type
elif self.is_classification():
assert split_type in ["auto", "stratified", "uniform", "time", "group"]
return split_type if split_type != "auto" else groups is None and "stratified" or "group"
elif self.is_regression():
assert split_type in ["auto", "uniform", "time", "group"]
return split_type if split_type != "auto" else "uniform"
elif self.is_rank():
assert groups is not None, "groups must be specified for ranking task."
assert split_type in ["auto", "group"]
return "group"
elif self.is_nlg():
assert split_type in ["auto", "uniform", "time", "group"]
return split_type if split_type != "auto" else "uniform"
def preprocess(self, X, transformer=None):
    """Convert raw input into the form expected by the estimators.

    Plain Python lists are converted to a DataFrame keyed by the transformer's
    string-column names; ints and pandas-on-Spark frames pass through; sparse
    matrices are converted to CSR. The transformer, if given, is applied last.

    Args:
        X: Input data (list, int, sparse matrix, DataFrame, or pandas-on-Spark frame).
        transformer: Optional DataTransformer used for column names and transform.

    Returns:
        The preprocessed data.

    Raises:
        IndexError: If the input has more columns than the training data had.
    """
    if isinstance(X, List):
        try:
            if isinstance(X[0], List):
                # Transpose row-major input into per-column sequences.
                X = [x for x in zip(*X)]
            # NOTE: after the transpose above the elements are tuples, so this
            # re-check is False for originally-nested input; single-row input
            # gets each scalar wrapped in a one-element list.
            still_nested = isinstance(X[0], List)
            columns = {}
            for idx in range(len(X)):
                name = transformer._str_columns[idx]
                columns[name] = X[idx] if still_nested else [X[idx]]
            X = pd.DataFrame(columns)
        except IndexError:
            raise IndexError("Test data contains more columns than training data, exiting")
    elif isinstance(X, int):
        return X
    elif isinstance(X, psDataFrame):
        # pandas-on-Spark data is passed through untransformed.
        return X
    elif issparse(X):
        X = X.tocsr()
        if self.is_ts_forecast():
            X = pd.DataFrame(X)
    if transformer:
        X = transformer.transform(X)
    return X
def evaluate_model_CV(
    self,
    config: dict,
    estimator: EstimatorSubclass,
    X_train_all,
    y_train_all,
    budget,
    kf,
    eval_metric,
    best_val_loss,
    cv_score_agg_func=None,
    log_training_metric=False,
    fit_kwargs: Optional[dict] = None,
    free_mem_ratio=0,
):
    """Evaluate a configuration with cross-validation.

    Args:
        config: Hyperparameter configuration used when fitting the estimator.
        estimator: Estimator class of the model.
        X_train_all: Complete training feature data (numpy/pandas, or a
            pandas-on-Spark frame/series).
        y_train_all: Complete training target data.
        budget: Overall time budget in seconds; a falsy value disables the
            budget checks.
        kf: Cross-validation splitter (sklearn splitter object).
        eval_metric: Metric name or callable passed through to get_val_loss.
        best_val_loss: Best validation loss found so far (not used in this
            implementation; kept for interface compatibility).
        cv_score_agg_func: Optional aggregation over per-fold losses/metrics;
            defaults to default_cv_score_agg_func.
        log_training_metric: If True, also compute the metric on training folds.
        fit_kwargs: Additional kwargs passed to the estimator's fit method.
        free_mem_ratio: Passed through to get_val_loss for memory management.

    Returns:
        Aggregated validation loss, aggregated metric, total train time across
        folds, and mean prediction time per completed fold.
    """
    if fit_kwargs is None:
        fit_kwargs = {}
    if cv_score_agg_func is None:
        cv_score_agg_func = default_cv_score_agg_func
    start_time = time.time()
    val_loss_folds = []
    log_metric_folds = []
    metric = None
    train_time = pred_time = 0
    total_fold_num = 0
    n = kf.get_n_splits()
    # Fixed seed so fold-level shuffling is reproducible across evaluations.
    rng = np.random.RandomState(2020)
    # Falsy budget propagates (None/0) and disables the per-train budget too.
    budget_per_train = budget and budget / n
    groups = None
    if self.is_classification():
        # Chained assignment: `labels` is first bound to the whole
        # (count, labels) tuple, then immediately rebound to just the label
        # list by the `_, labels` unpacking. Net effect: labels = label list.
        labels = _, labels = len_labels(y_train_all, return_labels=True)
    else:
        labels = fit_kwargs.get("label_list")  # pass the label list on to compute the evaluation metric
    if "sample_weight" in fit_kwargs:
        weight = fit_kwargs["sample_weight"]
        weight_val = None
    else:
        weight = weight_val = None
    is_spark_dataframe = isinstance(X_train_all, (psDataFrame, psSeries))
    if is_spark_dataframe:
        # Spark path: join features/targets (and optional weights/groups) into
        # one frame and let spark_kFold produce per-fold frames.
        dataframe = X_train_all.join(y_train_all)
        if weight is not None:
            dataframe = dataframe.join(weight)
        if isinstance(kf, (GroupKFold, StratifiedGroupKFold)):
            groups = kf.groups
            dataframe = dataframe.join(groups)
        kf = spark_kFold(dataframe, nFolds=n, foldCol=groups.name if groups is not None else "")
        shuffle = False
    else:
        X_train_split, y_train_split = X_train_all, y_train_all
        # Shuffle within folds unless the splitter forbids it or the task is
        # time-ordered.
        shuffle = getattr(kf, "shuffle", not self.is_ts_forecast())
        if isinstance(kf, RepeatedStratifiedKFold):
            kf = kf.split(X_train_split, y_train_split)
        elif isinstance(kf, (GroupKFold, StratifiedGroupKFold)):
            groups = kf.groups
            kf = kf.split(X_train_split, y_train_split, groups)
            shuffle = False
        elif isinstance(kf, TimeSeriesSplit):
            kf = kf.split(X_train_split, y_train_split)
        else:
            kf = kf.split(X_train_split)
    # For spark, the "indices" below are actually per-fold (train, val) frames.
    for train_index, val_index in kf:
        if shuffle:
            train_index = rng.permutation(train_index)
        if is_spark_dataframe:
            # cache data to increase compute speed
            X_train = train_index.spark.cache()
            X_val = val_index.spark.cache()
            y_train = X_train.pop(y_train_all.name)
            y_val = X_val.pop(y_train_all.name)
            if weight is not None:
                weight_val = X_val.pop(weight.name)
                fit_kwargs["sample_weight"] = X_train.pop(weight.name)
            groups_val = None
        elif isinstance(X_train_all, pd.DataFrame):
            X_train = X_train_split.iloc[train_index]
            X_val = X_train_split.iloc[val_index]
        else:
            X_train, X_val = X_train_split[train_index], X_train_split[val_index]
        if not is_spark_dataframe:
            y_train, y_val = y_train_split[train_index], y_train_split[val_index]
            if weight is not None:
                # Restrict sample weights to the current training fold.
                fit_kwargs["sample_weight"], weight_val = (
                    weight[train_index],
                    weight[val_index],
                )
            if groups is not None:
                fit_kwargs["groups"] = (
                    groups[train_index] if isinstance(groups, np.ndarray) else groups.iloc[train_index]
                )
                groups_val = groups[val_index] if isinstance(groups, np.ndarray) else groups.iloc[val_index]
            else:
                groups_val = None
        # Release any per-fold state held by the estimator before refitting.
        estimator.cleanup()
        val_loss_i, metric_i, train_time_i, pred_time_i = get_val_loss(
            config,
            estimator,
            X_train,
            y_train,
            X_val,
            y_val,
            weight_val,
            groups_val,
            eval_metric,
            self,
            labels,
            budget_per_train,
            log_training_metric=log_training_metric,
            fit_kwargs=fit_kwargs,
            free_mem_ratio=free_mem_ratio,
        )
        if isinstance(metric_i, dict) and "intermediate_results" in metric_i.keys():
            # Intermediate results are per-fold noise; drop before aggregation.
            del metric_i["intermediate_results"]
        if weight is not None:
            # Restore the full weight vector for the next fold / for callers.
            fit_kwargs["sample_weight"] = weight
        total_fold_num += 1
        val_loss_folds.append(val_loss_i)
        log_metric_folds.append(metric_i)
        train_time += train_time_i
        pred_time += pred_time_i
        if is_spark_dataframe:
            X_train.spark.unpersist()  # uncache data to free memory
            X_val.spark.unpersist()  # uncache data to free memory
        if budget and time.time() - start_time >= budget:
            # Time budget exhausted: aggregate over the folds completed so far.
            break
    val_loss, metric = cv_score_agg_func(val_loss_folds, log_metric_folds)
    n = total_fold_num
    pred_time /= n
    return val_loss, metric, train_time, pred_time
def default_estimator_list(self, estimator_list: List[str], is_spark_dataframe: bool = False) -> List[str]:
    """Return the validated or default estimator list for this task.

    When an explicit list is passed, it is filtered for spark compatibility
    (spark data requires '*_spark' estimators, non-spark data forbids them).
    When "auto" is passed, a task-appropriate default list is built and then
    filtered the same way.

    Args:
        estimator_list: Either "auto" or an explicit list of estimator names.
        is_spark_dataframe: True if the training data is a spark dataframe.

    Returns:
        A list of estimator names valid for this task and data type.

    Raises:
        ValueError: If filtering an explicit list leaves no estimator.
    """
    if estimator_list != "auto":
        original_count = len(estimator_list)
        if is_spark_dataframe:
            # Spark data can only be handled by the '*_spark' estimators.
            estimator_list = [est for est in estimator_list if est.endswith("_spark")]
            if not estimator_list:
                raise ValueError(
                    "Spark dataframes only support estimator names ending with `_spark`. Non-supported "
                    "estimators are removed. No estimator is left."
                )
            if original_count != len(estimator_list):
                logger.warning(
                    "Spark dataframes only support estimator names ending with `_spark`. Non-supported "
                    "estimators are removed."
                )
        else:
            # Non-spark data cannot use the '*_spark' estimators.
            estimator_list = [est for est in estimator_list if not est.endswith("_spark")]
            if not estimator_list:
                raise ValueError(
                    "Non-spark dataframes only support estimator names not ending with `_spark`. Non-supported "
                    "estimators are removed. No estimator is left."
                )
            if original_count != len(estimator_list):
                logger.warning(
                    "Non-spark dataframes only support estimator names not ending with `_spark`. Non-supported "
                    "estimators are removed."
                )
        return estimator_list

    # "auto": build the default list for this task type.
    if self.is_rank():
        estimator_list = ["lgbm", "xgboost", "xgb_limitdepth", "lgbm_spark"]
    elif self.is_nlp():
        estimator_list = ["transformer"]
    elif self.is_ts_forecastpanel():
        estimator_list = ["tft"]
    else:
        estimator_list = [
            "lgbm",
            "rf",
            "xgboost",
            "extra_tree",
            "xgb_limitdepth",
            "lgbm_spark",
        ]
        try:
            import catboost  # noqa: F401 -- availability check only

            # Keep catboost in its historical position after "rf".
            estimator_list.insert(2, "catboost")
        except ImportError:
            pass

    if not self.is_regression():
        estimator_list += ["lrl1"]

    # Final spark/non-spark filter, mirroring the explicit-list path.
    if is_spark_dataframe:
        return [est for est in estimator_list if est.endswith("_spark")]
    return [est for est in estimator_list if not est.endswith("_spark")]
def default_metric(self, metric: str) -> str:
    """Return the metric to optimize for this task.

    An explicit metric name is returned unchanged; "auto" is resolved to a
    task-appropriate default.

    Args:
        metric: A metric name or "auto".

    Returns:
        The metric name to use.
    """
    if metric != "auto":
        # Respect an explicit user choice.
        return metric
    if self.is_nlp():
        from flaml.automl.nlp.utils import (
            load_default_huggingface_metric_for_task,
        )

        return load_default_huggingface_metric_for_task(self.name)
    if self.is_binary():
        return "roc_auc"
    if self.is_multiclass():
        return "log_loss"
    if self.is_ts_forecast():
        return "mape"
    if self.is_rank():
        return "ndcg"
    return "r2"
@staticmethod
def prepare_sample_train_data(automlstate, sample_size):
    """Delegate sampled-training-data preparation to the AutoML state.

    Args:
        automlstate: The AutoMLState instance holding the training data.
        sample_size: Number of examples to sample for training.

    Returns:
        Whatever automlstate.prepare_sample_train_data returns for this size.
    """
    return automlstate.prepare_sample_train_data(sample_size)

View File

@ -1,347 +0,0 @@
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, List, Optional, Tuple, Union
import numpy as np
from flaml.automl.data import DataFrame, Series, psDataFrame, psSeries
if TYPE_CHECKING:
import flaml
# TODO: if your task is not specified in here, define your task as an all-capitalized word

# NLP task-name constants.
SEQCLASSIFICATION = "seq-classification"
MULTICHOICECLASSIFICATION = "multichoice-classification"
TOKENCLASSIFICATION = "token-classification"

SEQREGRESSION = "seq-regression"

# Accepted aliases for regression-style time-series forecasting.
TS_FORECASTREGRESSION = (
    "forecast",
    "ts_forecast",
    "ts_forecast_regression",
)
REGRESSION = ("regression", SEQREGRESSION, *TS_FORECASTREGRESSION)

TS_FORECASTCLASSIFICATION = "ts_forecast_classification"
TS_FORECASTPANEL = "ts_forecast_panel"
# All time-series forecasting task names (regression, classification, panel).
TS_FORECAST = (
    *TS_FORECASTREGRESSION,
    TS_FORECASTCLASSIFICATION,
    TS_FORECASTPANEL,
)

CLASSIFICATION = (
    "binary",
    "multiclass",
    "classification",
    SEQCLASSIFICATION,
    MULTICHOICECLASSIFICATION,
    TOKENCLASSIFICATION,
    TS_FORECASTCLASSIFICATION,
)

RANK = ("rank",)

SUMMARIZATION = "summarization"

# NLG vs NLU task groupings; NLP_TASKS is their union.
NLG_TASKS = (SUMMARIZATION,)

NLU_TASKS = (
    SEQREGRESSION,
    SEQCLASSIFICATION,
    MULTICHOICECLASSIFICATION,
    TOKENCLASSIFICATION,
)

NLP_TASKS = (*NLG_TASKS, *NLU_TASKS)
def get_classification_objective(num_labels: int) -> str:
    """Map a label count to the FLAML classification objective name.

    Args:
        num_labels: Number of distinct class labels.

    Returns:
        "binary" for exactly two labels, otherwise "multiclass".
    """
    return "binary" if num_labels == 2 else "multiclass"
class Task(ABC):
    """
    Abstract base class for a machine learning task.

    Class definitions should implement abstract methods and provide a non-empty dictionary of estimator classes.
    A Task can be suitable to be used for multiple machine-learning tasks (e.g. classification or regression) or be
    implemented specifically for a single one depending on the generality of data validation and model evaluation methods
    implemented. The implementation of a Task may optionally use the training data and labels to determine data and task
    specific details, such as in determining if a problem is single-label or multi-label.

    FLAML evaluates at runtime how to behave exactly, relying on the task instance to provide implementations of
    operations which vary between tasks.
    """

    def __init__(
        self,
        task_name: str,
        X_train: Optional[Union[np.ndarray, DataFrame, psDataFrame]] = None,
        y_train: Optional[Union[np.ndarray, DataFrame, Series, psSeries]] = None,
    ):
        """Constructor.

        Args:
            task_name: String name for this type of task. Used when the Task can be generic and implement a number of
                types of sub-task.
            X_train: Optional. Some Task types may use the data shape or features to determine details of their usage,
                such as in binary vs multilabel classification.
            y_train: Optional. Some Task types may use the data shape or features to determine details of their usage,
                such as in binary vs multilabel classification.
        """
        self.name = task_name
        # Estimator registry; lazily populated by subclass `estimators`
        # implementations (see estimator_class_from_str below).
        self._estimators = None

    def __str__(self) -> str:
        """Name of this task type."""
        return self.name

    @abstractmethod
    def evaluate_model_CV(
        self,
        config: dict,
        estimator: "flaml.automl.ml.BaseEstimator",
        X_train_all: Union[np.ndarray, DataFrame, psDataFrame],
        y_train_all: Union[np.ndarray, DataFrame, Series, psSeries],
        budget: int,
        kf,
        eval_metric: str,
        best_val_loss: float,
        log_training_metric: bool = False,
        # NOTE(review): mutable default `{}` on an abstract signature; concrete
        # implementations should prefer `None` and create a fresh dict.
        fit_kwargs: Optional[dict] = {},
    ) -> Tuple[float, float, float, float]:
        """Evaluate the model using cross-validation.

        Args:
            config: configuration used in the evaluation of the metric.
            estimator: Estimator class of the model.
            X_train_all: Complete training feature data.
            y_train_all: Complete training target data.
            budget: Training time budget.
            kf: Cross-validation index generator.
            eval_metric: Metric name to be used for evaluation.
            best_val_loss: Best current validation-set loss.
            log_training_metric: Bool defaults False. Enables logging of the training metric.
            fit_kwargs: Additional kwargs passed to the estimator's fit method.

        Returns:
            validation loss, metric value, train time, prediction time
        """

    @abstractmethod
    def validate_data(
        self,
        automl: "flaml.automl.automl.AutoML",
        state: "flaml.automl.state.AutoMLState",
        X_train_all: Union[np.ndarray, DataFrame, psDataFrame, None],
        y_train_all: Union[np.ndarray, DataFrame, Series, psSeries, None],
        dataframe: Union[DataFrame, None],
        label: str,
        X_val: Optional[Union[np.ndarray, DataFrame, psDataFrame]] = None,
        y_val: Optional[Union[np.ndarray, DataFrame, Series, psSeries]] = None,
        groups_val: Optional[List[str]] = None,
        groups: Optional[List[str]] = None,
    ):
        """Validate that the data is suitable for this task type.

        Args:
            automl: The AutoML instance from which this task has been constructed.
            state: The AutoMLState instance for this run.
            X_train_all: The complete data set or None if dataframe is supplied.
            y_train_all: The complete target set or None if dataframe is supplied.
            dataframe: A dataframe containing the complete data set with targets.
            label: The name of the target column in dataframe.
            X_val: Optional. A data set for validation.
            y_val: Optional. A target vector corresponding to X_val for validation.
            groups_val: Group labels (with matching length to y_val) or group counts (with sum equal to length of y_val)
                for validation data. Need to be consistent with groups.
            groups: Group labels (with matching length to y_train) or groups counts (with sum equal to length of y_train)
                for training data.

        Raises:
            AssertionError: The data provided is invalid for this task type and configuration.
        """

    @abstractmethod
    def prepare_data(
        self,
        state: "flaml.automl.state.AutoMLState",
        X_train_all: Union[np.ndarray, DataFrame, psDataFrame],
        y_train_all: Union[np.ndarray, DataFrame, Series, psSeries, None],
        auto_augment: bool,
        eval_method: str,
        split_type: str,
        split_ratio: float,
        n_splits: int,
        data_is_df: bool,
        sample_weight_full: Optional[List[float]] = None,
    ):
        """Prepare the data for fitting or inference.

        Args:
            automl: The AutoML instance from which this task has been constructed.
            state: The AutoMLState instance for this run.
            X_train_all: The complete data set or None if dataframe is supplied. Must
                contain the target if y_train_all is None
            y_train_all: The complete target set or None if supplied in X_train_all.
            auto_augment: If true, task-specific data augmentations will be applied.
            eval_method: A string of resampling strategy, one of ['auto', 'cv', 'holdout'].
            split_type: str or splitter object, default="auto" | the data split type.
                * A valid splitter object is an instance of a derived class of scikit-learn
                [KFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold)
                and have ``split`` and ``get_n_splits`` methods with the same signatures.
                Set eval_method to "cv" to use the splitter object.
                * Valid str options depend on different tasks.
                For classification tasks, valid choices are
                ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
                For regression tasks, valid choices are ["auto", 'uniform', 'time'].
                "auto" -> uniform.
                For time series forecast tasks, must be "auto" or 'time'.
                For ranking task, must be "auto" or 'group'.
            split_ratio: A float of the validation data percentage for holdout.
            n_splits: An integer of the number of folds for cross-validation.
            data_is_df: True if the data was provided as a DataFrame else False.
            sample_weight_full: A 1d arraylike of the sample weight.

        Raises:
            AssertionError: The configuration provided is invalid for this task type and data.
        """

    @abstractmethod
    def decide_split_type(
        self,
        split_type: str,
        y_train_all: Union[np.ndarray, DataFrame, Series, psSeries, None],
        fit_kwargs: dict,
        groups: Optional[List[str]] = None,
    ) -> str:
        """Choose an appropriate data split type for this data and task.

        If split_type is 'auto' then this is determined based on the task type and data.
        If a specific split_type is requested then the choice is validated to be appropriate.

        Args:
            split_type: Either 'auto' or a task appropriate split type.
            y_train_all: The complete set of targets.
            fit_kwargs: Additional kwargs passed to the estimator's fit method.
            groups: Optional. Group labels (with matching length to y_train) or groups counts (with sum equal to length
                of y_train) for training data.

        Returns:
            The determined appropriate split type.

        Raises:
            AssertionError: The requested split_type is invalid for this task, configuration and data.
        """

    @abstractmethod
    def preprocess(
        self,
        X: Union[np.ndarray, DataFrame, psDataFrame],
        transformer: Optional["flaml.automl.data.DataTransformer"] = None,
    ) -> Union[np.ndarray, DataFrame]:
        """Preprocess the data ready for fitting or inference with this task type.

        Args:
            X: The data set to process.
            transformer: A DataTransformer instance to be used in processing.

        Returns:
            The preprocessed data set having the same type as the input.
        """

    @abstractmethod
    def default_estimator_list(
        self,
        estimator_list: Union[List[str], str] = "auto",
        is_spark_dataframe: bool = False,
    ) -> List[str]:
        """Return the list of default estimators registered for this task type.

        If 'auto' is provided then the default list is returned, else the provided list will be validated given this task
        type.

        Args:
            estimator_list: Either 'auto' or a list of estimator names to be validated.
            is_spark_dataframe: True if the data is a spark dataframe.

        Returns:
            A list of valid estimator names for this task type.
        """

    @abstractmethod
    def default_metric(self, metric: str) -> str:
        """Return the default metric for this task type.

        If 'auto' is provided then the default metric for this task will be returned. Otherwise, the provided metric name
        is validated for this task type.

        Args:
            metric: The name of a metric to be used in evaluation of models during fitting or validation.

        Returns:
            The default metric, or the provided metric if it is valid for this task type.
        """

    # --- membership helpers over the task-name constants above ---

    def is_ts_forecast(self) -> bool:
        """True for any time-series forecasting task."""
        return self.name in TS_FORECAST

    def is_ts_forecastpanel(self) -> bool:
        """True for panel (multi-entity) time-series forecasting."""
        return self.name == TS_FORECASTPANEL

    def is_ts_forecastregression(self) -> bool:
        """True for regression-style time-series forecasting."""
        return self.name in TS_FORECASTREGRESSION

    def is_nlp(self) -> bool:
        """True for any NLP (NLU or NLG) task."""
        return self.name in NLP_TASKS

    def is_nlg(self) -> bool:
        """True for natural-language generation tasks."""
        return self.name in NLG_TASKS

    def is_classification(self) -> bool:
        """True for any classification task (including seq/token/ts variants)."""
        return self.name in CLASSIFICATION

    def is_rank(self) -> bool:
        """True for the ranking task."""
        return self.name in RANK

    def is_binary(self) -> bool:
        """True for binary classification."""
        return self.name == "binary"

    def is_seq_regression(self) -> bool:
        """True for sequence regression."""
        return self.name == SEQREGRESSION

    def is_seq_classification(self) -> bool:
        """True for sequence classification."""
        return self.name == SEQCLASSIFICATION

    def is_token_classification(self) -> bool:
        """True for token classification."""
        return self.name == TOKENCLASSIFICATION

    def is_summarization(self) -> bool:
        """True for summarization."""
        return self.name == SUMMARIZATION

    def is_multiclass(self) -> bool:
        # Substring match, so any name containing "multiclass" qualifies.
        return "multiclass" in self.name

    def is_regression(self) -> bool:
        """True for any regression task (including seq/ts variants)."""
        return self.name in REGRESSION

    def __eq__(self, other: str) -> bool:
        """For backward compatibility with all the string comparisons to task"""
        return self.name == other

    def estimator_class_from_str(self, estimator_name: str) -> "flaml.automl.ml.BaseEstimator":
        """Determine the estimator class corresponding to the provided name.

        Args:
            estimator_name: Name of the desired estimator.

        Returns:
            The estimator class corresponding to the provided name.

        Raises:
            ValueError: The provided estimator_name has not been registered for this task type.
        """
        # `estimators` is a property supplied by concrete subclasses.
        if estimator_name in self.estimators:
            return self.estimators[estimator_name]
        else:
            # NOTE(review): the implicit concatenation below yields
            # "...are supported.Please use..." (missing space) at runtime.
            raise ValueError(
                f"{estimator_name} is not a built-in learner for this task type, "
                f"only {list(self.estimators.keys())} are supported."
                "Please use AutoML.add_learner() to add a customized learner."
            )

View File

@ -1,523 +0,0 @@
import logging
import time
from typing import List
import pandas as pd
import numpy as np
from scipy.sparse import issparse
from sklearn.model_selection import (
GroupKFold,
TimeSeriesSplit,
)
from flaml.automl.ml import get_val_loss, default_cv_score_agg_func
from flaml.automl.time_series.ts_data import (
TimeSeriesDataset,
DataTransformerTS,
normalize_ts_data,
)
from flaml.automl.task.task import (
Task,
get_classification_objective,
TS_FORECAST,
TS_FORECASTPANEL,
)
logger = logging.getLogger(__name__)
class TimeSeriesTask(Task):
    """Task implementation for time-series forecasting (regression, classification, panel)."""

    @property
    def estimators(self):
        """Lazily build and return the name -> estimator-class registry for TS tasks."""
        if self._estimators is None:
            # put this into a function to avoid circular dependency
            from flaml.automl.time_series import (
                XGBoost_TS,
                XGBoostLimitDepth_TS,
                RF_TS,
                LGBM_TS,
                ExtraTrees_TS,
                CatBoost_TS,
                Prophet,
                Orbit,
                ARIMA,
                SARIMAX,
                TemporalFusionTransformerEstimator,
                HoltWinters,
            )

            self._estimators = {
                "xgboost": XGBoost_TS,
                "xgb_limitdepth": XGBoostLimitDepth_TS,
                "rf": RF_TS,
                "lgbm": LGBM_TS,
                "extra_tree": ExtraTrees_TS,
                "arima": ARIMA,
                "sarimax": SARIMAX,
                "holt-winters": HoltWinters,
                "catboost": CatBoost_TS,
                "tft": TemporalFusionTransformerEstimator,
            }

            try:
                # Availability check only; the FLAML wrapper `Prophet` (imported
                # above) is what gets registered.
                from prophet import Prophet as foo

                self._estimators["prophet"] = Prophet
            except ImportError:
                logger.info("Couldn't import Prophet, skipping")

            try:
                # Availability check for the orbit package.
                from orbit.models import DLT

                self._estimators["orbit"] = Orbit
            except ImportError:
                # NOTE(review): message says Prophet but this branch guards the
                # orbit import -- likely a copy-paste slip.
                logger.info("Couldn't import Prophet, skipping")
        return self._estimators

    # processed
    def validate_data(
        self,
        automl,
        state,
        X_train_all,
        y_train_all,
        dataframe,
        label,
        X_val=None,
        y_val=None,
        groups_val=None,
        groups=None,
    ):
        """Normalize the various input forms into a TimeSeriesDataset and wire it
        into the AutoML/state objects.

        Accepts either a ready TimeSeriesDataset, (X, y) pairs, or a dataframe
        plus label. Raises ValueError/AssertionError on inconsistent input.
        """
        # first beat the data into a TimeSeriesDataset shape
        if isinstance(X_train_all, TimeSeriesDataset):
            # in this case, we're most likely being called by another FLAML instance
            # so all the preliminary cleaning has already been done
            pre_data = X_train_all
            val_len = len(pre_data.X_val)
        else:
            if label is None and dataframe is not None:
                raise ValueError("If data is specified via dataframe parameter, you must also specify label")

            # Derive the target name(s) when not given explicitly.
            if isinstance(y_train_all, pd.Series):
                label = y_train_all.name
            elif isinstance(y_train_all, np.ndarray):
                label = "y"  # Prophet convention

            if isinstance(label, str):
                target_names = [label]
            else:
                target_names = label

            # NOTE(review): `self.time_col` is read before any assignment in
            # this class -- presumably initialized elsewhere (e.g. __init__ of
            # this class outside this view); verify.
            if self.time_col is None:
                # Default: first column of the supplied data is the time column.
                if isinstance(X_train_all, pd.DataFrame):
                    assert dataframe is None, "One of dataframe and X arguments must be None"
                    self.time_col = X_train_all.columns[0]
                elif dataframe is not None:
                    assert X_train_all is None, "One of dataframe and X arguments must be None"
                    self.time_col = dataframe.columns[0]
                else:
                    self.time_col = "ds"

            automl._df = True

            if X_train_all is not None:
                assert y_train_all is not None, "If X_train_all is not None, y_train_all must also be"
                assert dataframe is None, "If X_train_all is provided, dataframe must be None"
                dataframe = TimeSeriesDataset.to_dataframe(X_train_all, y_train_all, target_names, self.time_col)
            elif dataframe is not None:
                assert label is not None, "A label or list of labels must be provided."
                assert isinstance(dataframe, pd.DataFrame), "dataframe must be a pandas DataFrame"
                assert label in dataframe.columns, f"{label} must a column name in dataframe"
            else:
                raise ValueError("Must supply either X_train_all and y_train_all, or dataframe and label")

            try:
                dataframe[self.time_col] = pd.to_datetime(dataframe[self.time_col])
            except Exception:
                raise ValueError(
                    f"For '{TS_FORECAST}' task, time column {self.time_col} must contain timestamp values."
                )

            dataframe = remove_ts_duplicates(dataframe, self.time_col)

            if X_val is not None:
                assert y_val is not None, "If X_val is not None, y_val must also be"
                val_df = TimeSeriesDataset.to_dataframe(X_val, y_val, target_names, self.time_col)
                val_len = len(val_df)
            else:
                val_len = 0
                val_df = None

            pre_data = TimeSeriesDataset(
                train_data=dataframe,
                time_col=self.time_col,
                target_names=target_names,
                test_data=val_df,
            )

        # TODO: should the transformer be a property of the dataset instead?
        automl._transformer = DataTransformerTS(self.time_col, label)
        Xt, yt = automl._transformer.fit_transform(pre_data.X_all, pre_data.y_all)
        df_t = pd.concat([Xt, yt], axis=1)

        # Rebuild the dataset from the transformed frame and restore the
        # train/validation boundary.
        data = TimeSeriesDataset(
            train_data=df_t,
            time_col=pre_data.time_col,
            target_names=pre_data.target_names,
        ).move_validation_boundary(-val_len)

        # now setup the properties of all the other relevant objects

        # TODO: where are these used? Replace with pointers to data?
        automl._X_train_all, automl._y_train_all = Xt, yt

        # TODO: where are these used?
        automl._nrow, automl._ndim = data.X_train.shape

        # make a property instead? Or just fix the call?
        automl._label_transformer = automl._transformer.label_transformer

        automl._feature_names_in_ = (
            automl._X_train_all.columns.to_list() if hasattr(automl._X_train_all, "columns") else None
        )

        self.time_col = data.time_col
        self.target_names = data.target_names

        # For TS tasks the dataset object itself is stored in the X_* slots;
        # the y_* slots are unused (None).
        automl._state.X_val = data
        automl._state.X_train = data
        automl._state.y_train = None
        automl._state.y_val = None
        if data.test_data is not None and len(data.test_data) > 0:
            automl._state.X_train_all = data.move_validation_boundary(len(data.test_data))
        else:
            automl._state.X_train_all = data
        automl._state.y_train_all = None

        automl._state.data_size = data.train_data.shape
        automl.data_size_full = len(data.all_data)
        automl._state.groups = None
        automl._sample_weight_full = None

    def prepare_data(
        self,
        state,
        X_train_all,
        y_train_all,
        # NOTE(review): parameter name differs from the base class's
        # `auto_augment`; verify callers pass positionally.
        auto_argument,
        eval_method,
        split_type,
        split_ratio,
        n_splits,
        data_is_df,
        sample_weight_full,
        time_col=None,
    ):
        """Set up holdout or cross-validation state for a TS task.

        Builds the holdout split (or the TimeSeriesSplit fold generator) on
        `state`, adjusting n_splits down when data is too short.
        """
        state.kf = None
        state.data_size_full = len(y_train_all)

        if split_type in ["uniform", "stratified"]:
            raise ValueError(f"Split type {split_type} is not valid for time series")

        state.groups = None
        state.groups_all = None
        state.groups_val = None

        ts_data = state.X_val
        no_test_data = ts_data is None or ts_data.test_data is None or len(ts_data.test_data) == 0
        if no_test_data and eval_method == "holdout":
            # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
            period = state.fit_kwargs["period"]
            if self.name == TS_FORECASTPANEL:
                # TODO: move this into the TimeSeriesDataset class
                X_train_all = ts_data.X_train
                y_train_all = ts_data.y_train

                # Re-base and sort by group ids + time so the holdout cutoff
                # applies per group.
                X_train_all["time_idx"] -= X_train_all["time_idx"].min()
                X_train_all["time_idx"] = X_train_all["time_idx"].astype("int")
                ids = state.fit_kwargs["group_ids"].copy()
                ids.append(ts_data.time_col)
                ids.append("time_idx")
                y_train_all = pd.DataFrame(y_train_all)
                y_train_all[ids] = X_train_all[ids]

                X_train_all = X_train_all.sort_values(ids)
                y_train_all = y_train_all.sort_values(ids)
                training_cutoff = X_train_all["time_idx"].max() - period
                X_train = X_train_all[lambda x: x.time_idx <= training_cutoff]
                y_train = y_train_all[lambda x: x.time_idx <= training_cutoff].drop(columns=ids)
                X_val = X_train_all[lambda x: x.time_idx > training_cutoff]
                y_val = y_train_all[lambda x: x.time_idx > training_cutoff].drop(columns=ids)

                train_data = normalize_ts_data(
                    X_train,
                    ts_data.target_names,
                    ts_data.time_col,
                    y_train,
                )
                test_data = normalize_ts_data(
                    X_val,
                    ts_data.target_names,
                    ts_data.time_col,
                    y_val,
                )
                ts_data = TimeSeriesDataset(
                    train_data,
                    ts_data.time_col,
                    ts_data.target_names,
                    ts_data.frequency,
                    test_data,
                )
                state.X_val = ts_data
                state.X_train = ts_data

            else:
                # if eval_method = holdout, make holdout data
                num_samples = ts_data.train_data.shape[0]
                assert period < num_samples, f"period={period}>#examples={num_samples}"
                state.X_val = ts_data.move_validation_boundary(-period)
                state.X_train = state.X_val

        if eval_method != "holdout":
            if self.name != TS_FORECASTPANEL:
                period = state.fit_kwargs[
                    "period"
                ]  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                step_size = state.fit_kwargs.get("cv_step_size", period)

                ts_data = state.X_train
                if n_splits * step_size + 2 * period > ts_data.y_train.size:
                    # Shrink n_splits to what the series length can support.
                    n_splits = int((ts_data.y_train.size - 2 * period) / step_size)
                    assert n_splits >= 2, (
                        f"cross validation for forecasting period={period}"
                        f" requires input data with at least {2*period + 2*step_size} examples."
                    )
                    logger.info(f"Using nsplits={n_splits} due to data size limit.")
                state.kf = TimeSeriesSplit(n_splits=n_splits, test_size=period)
                # Extra attribute consumed by evaluate_model_CV below.
                state.kf.step_size = step_size

            else:
                n_groups = ts_data.X_train.groupby(state.fit_kwargs.get("group_ids")).ngroups
                period = state.fit_kwargs["period"]
                state.kf = TimeSeriesSplit(n_splits=n_splits, test_size=period * n_groups)

    # TODO: move task detection to Task.__init__!
    def decide_split_type(
        self,
        split_type,
        y_train_all,
        fit_kwargs,
        groups=None,
    ) -> str:
        """Validate the split type for TS tasks; always resolves to "time".

        Also upgrades the task to TS_FORECASTPANEL when group_ids are supplied.
        """
        # TODO: move into task creation!!!
        if self.name == "classification":
            self.name = get_classification_objective(len(np.unique(y_train_all)))

        # TODO: do we need this?
        if not isinstance(split_type, str):
            assert hasattr(split_type, "split") and hasattr(
                split_type, "get_n_splits"
            ), "split_type must be a string or a splitter object with split and get_n_splits methods."
            assert (
                not isinstance(split_type, GroupKFold) or groups is not None
            ), "GroupKFold requires groups to be provided."
            return split_type

        else:
            assert split_type in ["auto", "time"]
            assert isinstance(
                fit_kwargs.get("period"),
                int,  # NOTE: _decide_split_type is before kwargs is updated to fit_kwargs_by_estimator
            ), f"missing a required integer 'period' for '{TS_FORECAST}' task."
            if fit_kwargs.get("group_ids"):
                # TODO (MARK) This will likely not play well with the task class
                self.name = TS_FORECASTPANEL
                assert isinstance(
                    fit_kwargs.get("group_ids"), list
                ), f"missing a required List[str] 'group_ids' for '{TS_FORECASTPANEL}' task."
            return "time"

    # TODO: merge with preprocess() below
    def _preprocess(self, X, transformer=None):
        """Shared low-level preprocessing: list->DataFrame conversion, sparse
        handling, then the optional transformer."""
        if isinstance(X, List):
            try:
                if isinstance(X[0], List):
                    X = [x for x in zip(*X)]
                X = pd.DataFrame(
                    dict(
                        [
                            (transformer._str_columns[idx], X[idx])
                            if isinstance(X[0], List)
                            else (transformer._str_columns[idx], [X[idx]])
                            for idx in range(len(X))
                        ]
                    )
                )
            except IndexError:
                raise IndexError("Test data contains more columns than training data, exiting")
        elif isinstance(X, int):
            return X
        elif issparse(X):
            X = X.tocsr()
        if self.is_ts_forecast():
            X = pd.DataFrame(X)
        if transformer:
            X = transformer.transform(X)
        return X

    def preprocess(self, X, transformer=None):
        """Normalize TS input (DataFrame/ndarray/Series) then run _preprocess.

        Ints pass through; any other type is rejected.
        """
        if isinstance(X, pd.DataFrame) or isinstance(X, np.ndarray) or isinstance(X, pd.Series):
            X = X.copy()
            X = normalize_ts_data(X, self.target_names, self.time_col)
            return self._preprocess(X, transformer)
        elif isinstance(X, int):
            return X
        else:
            raise ValueError(f"unknown type of X, {X.__class__}")

    def evaluate_model_CV(
        self,
        config,
        estimator,
        X_train_all,
        y_train_all,
        budget,
        kf,
        eval_metric,
        best_val_loss,
        cv_score_agg_func=None,
        log_training_metric=False,
        # NOTE(review): mutable default dict; prefer None + fresh dict.
        fit_kwargs={},
        free_mem_ratio=0,  # what is this for?
    ):
        """Cross-validate a TS configuration over rolling-origin folds.

        X_train_all is a TimeSeriesDataset; folds come from its
        cv_train_val_sets using the TimeSeriesSplit parameters on `kf`.

        Returns:
            Aggregated validation loss, aggregated metric, total train time,
            and mean prediction time per completed fold.
        """
        if cv_score_agg_func is None:
            cv_score_agg_func = default_cv_score_agg_func
        start_time = time.time()
        val_loss_folds = []
        log_metric_folds = []
        metric = None
        train_time = pred_time = 0
        total_fold_num = 0
        n = kf.get_n_splits()
        if self.is_classification():
            labels = np.unique(y_train_all)
        else:
            labels = fit_kwargs.get("label_list")  # pass the label list on to compute the evaluation metric
        ts_data = X_train_all
        # NOTE(review): assumes `budget` is a number; a None budget would raise
        # here (the generic task guards with `budget and ...`).
        budget_per_train = budget / n
        # NOTE(review): redundant re-assignment (ts_data already set above).
        ts_data = X_train_all
        for data in ts_data.cv_train_val_sets(kf.n_splits, kf.test_size, kf.step_size):
            estimator.cleanup()
            val_loss_i, metric_i, train_time_i, pred_time_i = get_val_loss(
                config,
                estimator,
                X_train=data,
                y_train=None,
                X_val=data,
                y_val=None,
                eval_metric=eval_metric,
                labels=labels,
                budget=budget_per_train,
                log_training_metric=log_training_metric,
                fit_kwargs=fit_kwargs,
                task=self,
                weight_val=None,
                groups_val=None,
                free_mem_ratio=free_mem_ratio,
            )
            if isinstance(metric_i, dict) and "intermediate_results" in metric_i:
                del metric_i["intermediate_results"]
            total_fold_num += 1
            val_loss_folds.append(val_loss_i)
            log_metric_folds.append(metric_i)
            train_time += train_time_i
            pred_time += pred_time_i
            if time.time() - start_time >= budget:
                break
        val_loss, metric = cv_score_agg_func(val_loss_folds, log_metric_folds)
        n = total_fold_num
        pred_time /= n
        return val_loss, metric, train_time, pred_time

    def default_estimator_list(self, estimator_list: List[str], is_spark_dataframe: bool) -> List[str]:
        """Return the default (or pass through an explicit) estimator list for TS."""
        assert not is_spark_dataframe, "Spark is not yet supported for time series"

        # TODO: why not do this if/then in the calling function?
        if "auto" != estimator_list:
            return estimator_list

        if self.is_ts_forecastpanel():
            return ["tft"]

        estimator_list = [
            "lgbm",
            "rf",
            "xgboost",
            "extra_tree",
            "xgb_limitdepth",
        ]

        # Catboost appears to be way slower than the others, don't include it by default
        # try:
        #     import catboost
        #
        #     estimator_list.append("catboost")
        # except ImportError:
        #     pass

        if self.is_regression():
            estimator_list += ["arima", "sarimax"]

            try:
                # Availability check: only offer prophet when installed.
                import prophet

                estimator_list.append("prophet")
            except ImportError:
                pass

        return estimator_list

    def default_metric(self, metric: str) -> str:
        """Resolve "auto" to "mape" for TS forecasting; pass through otherwise."""
        assert self.is_ts_forecast(), "If this is not a TS forecasting task, this code should never have been called"
        if metric == "auto":
            return "mape"
        else:
            return metric

    @staticmethod
    def prepare_sample_train_data(automlstate, sample_size):
        """Return a tail-sampled view of the training dataset.

        Returns a 4-tuple mirroring the generic interface; only the first slot
        is used for TS tasks.
        """
        # we take the tail, rather than the head, for compatibility with time series
        shift = sample_size - len(automlstate.X_train.train_data)
        sampled_X_train = automlstate.X_train.move_validation_boundary(shift)

        return sampled_X_train, None, None, None
def remove_ts_duplicates(
    X,
    time_col,
):
    """Remove exact duplicate rows from a time series frame (targets included).

    Args:
        X: Dataframe of features and targets, with the timestamp as its first column.
        time_col: Name of the timestamp column.

    Returns:
        The dataframe with exact duplicate rows dropped.

    Raises:
        AssertionError: If, after dropping exact duplicates, a timestamp still
            appears more than once (duplicate timestamps with conflicting values
            in the other columns).
    """
    duplicates = X.duplicated()
    if any(duplicates):
        # Bug fix: the original logged `X.loc[duplicates, X][time_col]`, which
        # uses the DataFrame itself as a column key and raises; report the
        # duplicated timestamps instead.
        logger.warning(f"Duplicate timestamp values found in timestamp column. \n{X.loc[duplicates, time_col]}")
        X = X.drop_duplicates()
        logger.warning("Removed duplicate rows based on all columns")
    # Bug fix: the original asserted `...duplicated() is None`, which is always
    # False for a pandas Series, so this assert could never pass. Check that no
    # duplicated timestamps remain in the first (timestamp) column.
    assert (
        not X[[X.columns[0]]].duplicated().any()
    ), "Duplicate timestamp values with different values for other columns."
    return X

View File

@ -1,17 +0,0 @@
from .ts_model import (
Prophet,
Orbit,
ARIMA,
SARIMAX,
HoltWinters,
LGBM_TS,
XGBoost_TS,
RF_TS,
ExtraTrees_TS,
XGBoostLimitDepth_TS,
CatBoost_TS,
TimeSeriesEstimator,
)
from .tft import TemporalFusionTransformerEstimator
from .ts_data import TimeSeriesDataset

View File

@ -1,34 +0,0 @@
import math
import datetime
from functools import lru_cache
import pandas as pd
def monthly_fourier_features(timestamps: pd.Series, month_fourier_degree: int = 2):
    """Encode each timestamp's position within its month as Fourier features.

    Args:
        timestamps: Series of datetime values.
        month_fourier_degree: Number of (cos, sin) harmonic pairs to emit.

    Returns:
        DataFrame with columns cos1, sin1, ..., indexed like `timestamps`;
        empty input yields an empty frame with the same column names.
    """
    if len(timestamps) == 0:
        empty_cols = []
        for d in range(1, month_fourier_degree + 1):
            empty_cols += [f"cos{d}", f"sin{d}"]
        return pd.DataFrame(columns=empty_cols)

    # Fraction [0, 1] of the way through the month for each timestamp.
    month_pos = timestamps.apply(lambda t: position_in_month(datetime.date(t.year, t.month, t.day)))
    harmonics = {}
    for d in range(1, month_fourier_degree + 1):
        angle = 2 * d * math.pi * month_pos
        harmonics[f"cos{d}"] = angle.apply(math.cos)
        harmonics[f"sin{d}"] = angle.apply(math.sin)
    return pd.DataFrame(harmonics)
@lru_cache(maxsize=4096)
def position_in_month(d: datetime.date):
    """Fraction (0, 1] of the way through its month that date `d` falls.

    Measured from the last day of the previous month to the last day of the
    current month; cached because calendars repeat heavily across a series.
    """
    last_of_prev = datetime.date(d.year, d.month, 1) - datetime.timedelta(days=1)
    if d.month == 12:
        first_of_next = datetime.date(d.year + 1, 1, 1)
    else:
        first_of_next = datetime.date(d.year, d.month + 1, 1)
    last_of_this = first_of_next - datetime.timedelta(days=1)
    return (d - last_of_prev).days / (last_of_this - last_of_prev).days

View File

@ -1,156 +0,0 @@
try:
import pandas as pd
from pandas import DataFrame, Series, to_datetime
except ImportError:
class PD:
pass
pd = PD()
pd.DataFrame = None
pd.Series = None
DataFrame = Series = None
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
def make_lag_features(X: pd.DataFrame, y: pd.Series, lags: int):
    """Build an autoregressive design matrix with `lags` shifted column groups.

    Parameters
    ----------
    X : pandas.DataFrame
        Input features.
    y : pandas.Series
        Target vector; its previous value is appended as a feature.
    lags : int
        Number of shifted copies of the feature block to produce.

    Returns
    -------
    pandas.DataFrame
        Frame with columns `<col>_lag_0 ... <col>_lag_{lags-1}`; the first
        `lags` rows (which contain NaNs from shifting) are dropped.
    """
    X = X.reset_index(drop=True)
    # make sure we show y's _previous_ value to exclude data leaks
    X["lag_" + y.name] = y.shift(1).values
    shifted_frames = []
    for lag in range(lags):
        frame = X.shift(lag)
        frame.columns = [f"{c}_lag_{lag}" for c in X.columns]
        shifted_frames.append(frame)
    result = pd.concat(shifted_frames, axis=1).dropna().reset_index(drop=True)
    assert len(result) + lags == len(X)
    return result
class SklearnWrapper:
    """Direct multi-step forecaster built on a scalar sklearn-style regressor.

    Trains one independent model per forecast step ("direct" strategy), each
    on lagged features produced by `make_lag_features`; `predict` falls back
    to recursion when asked for more steps than were trained.
    """

    def __init__(
        self,
        model_class: type,
        horizon: int,
        lags: int,
        init_params: dict = None,
        fit_params: dict = None,
        pca_features: bool = False,
    ):
        # Replace None defaults with fresh dicts (avoids mutable-default sharing).
        init_params = init_params if init_params else {}
        self.fit_params = fit_params if fit_params else {}
        self.lags = lags
        self.horizon = horizon
        # TODO: use multiregression where available
        # One independently-trained model per forecast step.
        self.models = [model_class(**init_params) for _ in range(horizon)]
        self.pca_features = pca_features
        if self.pca_features:
            # Features are standardized before PCA; the PCA itself is created in
            # fit(), once the number of components to retain is known.
            self.norm = StandardScaler()
            self.pca = None

    def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs):
        """Fit one model per horizon step on lagged features of (X, y).

        Keeps references to X and y so predict() can rebuild the feature
        history. Returns self.
        """
        self._X = X
        self._y = y
        fit_params = {**self.fit_params, **kwargs}

        X_feat = make_lag_features(X, y, self.lags)
        if self.pca_features:
            X_trans = self.norm.fit_transform(X_feat)
            # Keep just enough components to explain (nearly) all the variance.
            cum_expl_var = np.cumsum(PCA(svd_solver="full").fit(X_trans).explained_variance_ratio_)
            self.pca = PCA(svd_solver="full", n_components=np.argmax(1 - cum_expl_var < 1e-6))
            X_trans = self.pca.fit_transform(X_trans)
        else:
            X_trans = X_feat

        for i, model in enumerate(self.models):
            # Model i is trained to predict the target (i + lags) positions
            # after the start of its feature window, i.e. i steps ahead of the
            # most recent lagged observation.
            offset = i + self.lags
            model.fit(X_trans[: len(X) - offset], y[offset:], **fit_params)
        return self

    def predict(self, X, X_train=None, y_train=None):
        """Forecast len(X) steps; recurses when len(X) exceeds the trained horizon.

        NOTE(review): assumes X immediately follows (X_train, y_train) in time —
        confirm at call sites.
        """
        if X_train is None:
            X_train = self._X
        if y_train is None:
            y_train = self._y

        X_train = X_train.reset_index(drop=True)
        X_train[self._y.name] = y_train.values
        Xall = pd.concat([X_train, X], axis=0).reset_index(drop=True)
        y = Xall.pop(self._y.name)

        # Build features from the history plus the first future row only.
        X_feat = make_lag_features(Xall[: len(X_train) + 1], y[: len(X_train) + 1], self.lags)
        if self.pca_features:
            X_trans = self.pca.transform(self.norm.transform(X_feat))
        else:
            X_trans = X_feat

        # predict all horizons from the latest features vector
        preds = pd.Series([m.predict(X_trans[-1:])[0] for m in self.models])
        if len(preds) < len(X):
            # recursive call if len(X) > trained horizon: treat our own
            # predictions as observed targets and forecast the remainder.
            y_train = pd.concat([y_train, preds], axis=0, ignore_index=True)
            preds = pd.concat(
                [
                    preds,
                    self.predict(
                        X=Xall[len(y_train) :],
                        X_train=Xall[: len(y_train)],
                        y_train=y_train,
                    ),
                ],
                axis=0,
                ignore_index=True,
            )

        if len(preds) > len(X):
            # Trim over-prediction from the final recursion level.
            preds = preds[: len(X)]

        preds.index = X.index
        # TODO: do we want auto-clipping?
        # return self._clip_predictions(preds)
        return preds

    # TODO: fix
    # @staticmethod
    # def _adjust_holidays(X):
    #     """Transform 'holiday' columns to binary feature.
    #
    #     Parameters
    #     ----------
    #     X : pandas.DataFrame
    #         Input features with 'holiday' column.
    #
    #     Returns
    #     -------
    #     pandas.DataFrame
    #         Holiday feature in numeric form
    #     """
    #     return X.assign(
    #         **{col: X[col] != "" for col in X.filter(like="_holiday_").columns}
    #     )

View File

@ -1,183 +0,0 @@
import time
try:
import pandas as pd
from pandas import DataFrame, Series, to_datetime
except ImportError:
class PD:
pass
pd = PD()
pd.DataFrame = None
pd.Series = None
DataFrame = Series = None
from flaml import tune
from flaml.automl.data import add_time_idx_col
from flaml.automl.time_series.ts_data import TimeSeriesDataset
from flaml.automl.time_series.ts_model import TimeSeriesEstimator
class TemporalFusionTransformerEstimator(TimeSeriesEstimator):
    """The class for tuning Temporal Fusion Transformer (pytorch_forecasting)."""

    @classmethod
    def search_space(cls, data, task, pred_horizon, **params):
        """Hyperparameter search space passed to the tuner (TFT model knobs)."""
        space = {
            "gradient_clip_val": {
                "domain": tune.loguniform(lower=0.01, upper=100.0),
                "init_value": 0.01,
            },
            "hidden_size": {
                "domain": tune.lograndint(lower=8, upper=512),
                "init_value": 16,
            },
            "hidden_continuous_size": {
                "domain": tune.randint(lower=1, upper=65),
                "init_value": 8,
            },
            "attention_head_size": {
                "domain": tune.randint(lower=1, upper=5),
                "init_value": 4,
            },
            "dropout": {
                "domain": tune.uniform(lower=0.1, upper=0.3),
                "init_value": 0.1,
            },
            "learning_rate": {
                "domain": tune.loguniform(lower=0.00001, upper=1.0),
                "init_value": 0.001,
            },
        }
        return space

    def transform_ds(self, X_train: TimeSeriesDataset, y_train, **kwargs):
        """Build the pytorch_forecasting TimeSeriesDataSet and dataloaders.

        Requires kwargs "period" (prediction horizon), "max_encoder_length" and
        "group_ids"; other dataset options are read from kwargs with defaults.
        Returns (training_dataset, train_dataloader, val_dataloader).
        """
        self.data = X_train.train_data

        max_prediction_length = kwargs["period"]
        self.max_encoder_length = kwargs["max_encoder_length"]
        # Hold out the last `period` points of the train frame for validation.
        training_cutoff = self.data["time_idx"].max() - max_prediction_length

        from pytorch_forecasting import TimeSeriesDataSet
        from pytorch_forecasting.data import GroupNormalizer

        self.group_ids = kwargs["group_ids"].copy()
        training = TimeSeriesDataSet(
            self.data[lambda x: x.time_idx <= training_cutoff],
            time_idx="time_idx",
            target=X_train.target_names[0],
            group_ids=self.group_ids,
            min_encoder_length=kwargs.get(
                "min_encoder_length", self.max_encoder_length // 2
            ),  # keep encoder length long (as it is in the validation set)
            max_encoder_length=self.max_encoder_length,
            min_prediction_length=1,
            max_prediction_length=max_prediction_length,
            static_categoricals=kwargs.get("static_categoricals", []),
            static_reals=kwargs.get("static_reals", []),
            time_varying_known_categoricals=kwargs.get("time_varying_known_categoricals", []),
            time_varying_known_reals=kwargs.get("time_varying_known_reals", []),
            time_varying_unknown_categoricals=kwargs.get("time_varying_unknown_categoricals", []),
            time_varying_unknown_reals=kwargs.get("time_varying_unknown_reals", []),
            variable_groups=kwargs.get(
                "variable_groups", {}
            ),  # group of categorical variables can be treated as one variable
            lags=kwargs.get("lags", {}),
            target_normalizer=GroupNormalizer(
                groups=kwargs["group_ids"], transformation="softplus"
            ),  # use softplus and normalize by group
            add_relative_time_idx=True,
            add_target_scales=True,
            add_encoder_length=True,
        )

        # create validation set (predict=True) which means to predict the last max_prediction_length points in time
        # for each series
        validation = TimeSeriesDataSet.from_dataset(training, self.data, predict=True, stop_randomization=True)

        # create dataloaders for model
        batch_size = kwargs.get("batch_size", 64)
        train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
        val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size * 10, num_workers=0)

        return training, train_dataloader, val_dataloader

    def fit(self, X_train, y_train, budget=None, **kwargs):
        """Train TFT on X_train; stores the best checkpointed model in
        self._model and returns the wall-clock training time in seconds.

        NOTE(review): `budget` is accepted but not used to cap training time —
        confirm whether that is intended.
        """
        import warnings

        import pytorch_lightning as pl
        import torch
        from pytorch_forecasting import TemporalFusionTransformer
        from pytorch_forecasting.metrics import QuantileLoss
        from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
        from pytorch_lightning.loggers import TensorBoardLogger

        # a bit of monkey patching to fix the MacOS test
        # all the log_prediction method appears to do is plot stuff, which ?breaks github tests
        def log_prediction(*args, **kwargs):
            pass

        TemporalFusionTransformer.log_prediction = log_prediction

        warnings.filterwarnings("ignore")
        current_time = time.time()
        super().fit(X_train, **kwargs)
        training, train_dataloader, val_dataloader = self.transform_ds(X_train, y_train, **kwargs)
        params = self.params.copy()
        # gradient_clip_val belongs to the Trainer, not the model; n_jobs is
        # a FLAML-level param that TFT does not accept.
        gradient_clip_val = params.pop("gradient_clip_val", None)
        params.pop("n_jobs", None)
        max_epochs = kwargs.get("max_epochs", 20)
        early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=10, verbose=False, mode="min")
        lr_logger = LearningRateMonitor()  # log the learning rate
        logger = TensorBoardLogger(kwargs.get("log_dir", "lightning_logs"))  # logging results to a tensorboard

        default_trainer_kwargs = dict(
            gpus=self._kwargs.get("gpu_per_trial", [0]) if torch.cuda.is_available() else None,
            max_epochs=max_epochs,
            gradient_clip_val=gradient_clip_val,
            callbacks=[lr_logger, early_stop_callback],
            logger=logger,
        )
        trainer = pl.Trainer(
            **default_trainer_kwargs,
        )
        tft = TemporalFusionTransformer.from_dataset(
            training,
            **params,
            lstm_layers=2,  # 2 is mostly optimal according to documentation
            output_size=7,  # 7 quantiles by default
            loss=QuantileLoss(),
            log_interval=10,  # uncomment for learning rate finder and otherwise, e.g. to 10 for logging every 10 batches
            reduce_on_plateau_patience=4,
        )
        # fit network
        trainer.fit(
            tft,
            train_dataloaders=train_dataloader,
            val_dataloaders=val_dataloader,
        )
        # Reload the checkpoint with the best validation loss.
        best_model_path = trainer.checkpoint_callback.best_model_path
        best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)
        train_time = time.time() - current_time
        self._model = best_tft
        return train_time

    def predict(self, X):
        """Forecast for the rows of X (or X.X_val when X is a TimeSeriesDataset).

        Returns a pd.Series of point predictions indexed by the group-id
        columns plus the time column.
        """
        ids = self.group_ids.copy()
        ids.append(self.time_col)
        # Encoder window: the last max_encoder_length points of the train data.
        encoder_data = self.data[lambda x: x.time_idx > x.time_idx.max() - self.max_encoder_length]
        # following pytorchforecasting example, make all target values equal to the last data
        last_data_cols = self.group_ids.copy()
        last_data_cols.append(self.target_names[0])
        last_data = self.data[lambda x: x.time_idx == x.time_idx.max()][last_data_cols]
        decoder_data = X.X_val if isinstance(X, TimeSeriesDataset) else X
        if "time_idx" not in decoder_data:
            decoder_data = add_time_idx_col(decoder_data)
        # Shift the decoder time index so it continues right after the encoder.
        decoder_data["time_idx"] += encoder_data["time_idx"].max() + 1 - decoder_data["time_idx"].min()
        decoder_data = decoder_data.merge(last_data, how="inner", on=self.group_ids)
        decoder_data = decoder_data.sort_values(ids)
        new_prediction_data = pd.concat([encoder_data, decoder_data], ignore_index=True)
        new_prediction_data["time_idx"] = new_prediction_data["time_idx"].astype("int")
        new_raw_predictions = self._model.predict(new_prediction_data)
        index = [decoder_data[idx].to_numpy() for idx in ids]
        predictions = pd.Series(new_raw_predictions.numpy().ravel(), index=index)
        return predictions

View File

@ -1,544 +0,0 @@
import copy
import datetime
import math
from dataclasses import dataclass, field
from typing import List, Optional, Callable, Dict, Generator, Union
import numpy as np
try:
import pandas as pd
from pandas import DataFrame, Series, to_datetime
from scipy.sparse import issparse
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from .feature import monthly_fourier_features
except ImportError:
class PD:
pass
pd = PD()
pd.DataFrame = None
pd.Series = None
DataFrame = Series = None
@dataclass
class TimeSeriesDataset:
    """A train/test split of a regular-frequency time series plus column-role
    metadata (known/unknown, categorical/real) inferred from dtypes."""

    train_data: pd.DataFrame
    time_idx: str  # name of the integer time-index column
    time_col: str  # name of the timestamp column
    target_names: List[str]
    frequency: str  # pandas frequency string inferred from the timestamps
    test_data: pd.DataFrame
    time_varying_known_categoricals: List[str] = field(default_factory=lambda: [])
    time_varying_known_reals: List[str] = field(default_factory=lambda: [])
    time_varying_unknown_categoricals: List[str] = field(default_factory=lambda: [])
    time_varying_unknown_reals: List[str] = field(default_factory=lambda: [])

    def __init__(
        self,
        train_data: pd.DataFrame,
        time_col: str,
        target_names: Union[str, List[str]],
        time_idx: str = "time_idx",
        test_data: Optional[pd.DataFrame] = None,
    ):
        """Infer frequency and column roles from `train_data`.

        Raises AssertionError when the timestamps are not of a regular,
        inferrable frequency.
        """
        self.train_data = train_data
        self.time_col = time_col
        self.time_idx = time_idx
        # Normalize target_names to a non-empty list.
        self.target_names = [target_names] if isinstance(target_names, str) else list(target_names)
        assert isinstance(self.target_names, list)
        assert len(self.target_names)

        self.frequency = pd.infer_freq(train_data[time_col].unique())
        assert self.frequency is not None, "Only time series of regular frequency are currently supported."

        # Float columns (minus targets) are treated as known reals; every other
        # non-target, non-time column is treated as a known categorical.
        float_cols = list(train_data.select_dtypes(include=["floating"]).columns)
        self.time_varying_known_reals = list(set(float_cols) - set(self.target_names))

        self.time_varying_known_categoricals = list(
            set(train_data.columns) - set(self.time_varying_known_reals) - set(self.target_names) - {time_col}
        )
        if test_data is not None:
            self.test_data = test_data
        else:
            # Empty frame with matching columns so downstream concat/len work.
            self.test_data = pd.DataFrame(columns=self.train_data.columns)

    def add_test_data(self, X: pd.DataFrame) -> "TimeSeriesDataset":
        """Re-split: rows before X's earliest timestamp become train, X becomes test."""
        assert self.time_col in X.columns
        train_data = self.all_data[self.all_data[self.time_col] < X[self.time_col].min()]
        return TimeSeriesDataset(train_data, self.time_col, self.target_names, self.time_idx, X)

    @staticmethod
    def to_dataframe(X, y, target_names: List[str], time_col: str):
        """Validate (X, y) and merge them into a single dataframe."""
        assert len(X) == len(y), "X_val and y_val must have the same length"
        validate_data_basic(X, y)
        # coerce them into a dataframe
        val_df = normalize_ts_data(X, target_names, time_col, y)
        return val_df

    @property
    def all_data(self):
        # Train followed by test; just train when there is no test data.
        if len(self.test_data):
            return pd.concat([self.train_data, self.test_data], axis=0)
        else:
            return self.train_data

    @property
    def regressors(self):
        # All known exogenous columns (categoricals + reals).
        return self.time_varying_known_categoricals + self.time_varying_known_reals

    @property
    def end_date(self):
        # Timestamp of the very last row (test if present, else train).
        test_len = 0 if self.test_data is None else len(self.test_data)
        data = self.test_data if test_len else self.train_data
        return data.iloc[-1][self.time_col]

    def _X(self, df: pd.DataFrame):
        # Feature view: every column except the targets.
        features = [col for col in df.columns if col not in self.target_names]
        return df[features]

    def _y(self, df: pd.DataFrame):
        # Target view: DataFrame for multivariate targets, Series otherwise.
        if len(self.target_names) > 1:
            return df[self.target_names]
        else:
            return df[self.target_names[0]]

    @property
    def X_train(self) -> pd.DataFrame:
        return self._X(self.train_data)

    @property
    def X_val(self) -> pd.DataFrame:
        return self._X(self.test_data)

    @property
    def X_all(self) -> pd.DataFrame:
        return pd.concat([self.X_train, self.X_val], axis=0)

    @property
    def y_train(self) -> pd.DataFrame:
        return self._y(self.train_data)

    @property
    def y_val(self) -> pd.DataFrame:
        return self._y(self.test_data)

    @property
    def y_all(self) -> pd.DataFrame:
        return self._y(self.all_data)

    def next_scale(self) -> int:
        """Seasonal period to consider next for this frequency (e.g. daily -> 7)."""
        scale_map = {"D": 7, "MS": 12}
        return scale_map.get(self.frequency, 8)

    def known_features_to_floats(self, train: bool, drop_first: bool = True) -> np.ndarray:
        """One-hot the known categoricals and stack them with the known reals.

        Train and test are encoded together so both splits get identical
        columns; returns only the requested split's rows.
        """
        # this is a bit tricky as shapes for train and test data must match, so need to encode together
        combined = pd.concat(
            [
                self.train_data,
                self.test_data,
            ],
            ignore_index=True,
        )

        cat_one_hots = pd.get_dummies(
            combined[self.time_varying_known_categoricals],
            columns=self.time_varying_known_categoricals,
            drop_first=drop_first,
        ).values.astype(float)

        reals = combined[self.time_varying_known_reals].values.astype(float)
        both = np.concatenate([reals, cat_one_hots], axis=1)

        if train:
            return both[: len(self.train_data)]
        else:
            return both[len(self.train_data) :]

    # def unique_dimension_values(self) -> np.ndarray:
    #     # this is the same set for train and test data, by construction
    #     return self.combine_dims(self.train_data).unique()
    #
    # def combine_dims(self, df):
    #     return df.apply(lambda row: tuple([row[d] for d in self.dimensions]), axis=1)

    def to_univariate(self) -> Dict[str, "TimeSeriesDataset"]:
        """
        Convert a multivariate TrainingData to a dict of univariate ones
        @param df:
        @return:

        NOTE(review): calls self.combine_dims, which is commented out above —
        this would raise AttributeError if invoked; confirm before use.
        """
        train_dims = self.combine_dims(self.train_data)
        test_dims = self.combine_dims(self.test_data)

        out = {}
        for d in train_dims.unique():
            out[d] = copy.copy(self)
            out[d].train_data = self.train_data[train_dims == d]
            out[d].test_data = self.test_data[test_dims == d]
        return out

    def move_validation_boundary(self, steps: int) -> "TimeSeriesDataset":
        """Shallow-copy with the train/test boundary moved by `steps` rows
        (positive: test rows become train; negative: train rows become test)."""
        out = copy.copy(self)
        if steps > 0:
            out.train_data = pd.concat([self.train_data, self.test_data[:steps]])
            out.test_data = self.test_data[steps:]
        elif steps < 0:
            out.train_data = self.train_data[:steps]
            if len(self.test_data):
                out.test_data = pd.concat([self.train_data[steps:], self.test_data])
            else:
                out.test_data = self.train_data[steps:]

        return out

    def cv_train_val_sets(
        self, n_splits: int, val_length: int, step_size: int
    ) -> Generator["TimeSeriesDataset", None, None]:
        """Yield `n_splits` rolling-origin CV splits carved out of train_data."""
        max_index = len(self.train_data) - 1
        for i in range(n_splits):
            out = copy.copy(self)
            # Validation windows march forward by step_size per split.
            val_start = max_index - (n_splits - i - 1) * step_size - val_length
            out.train_data = self.train_data[:val_start]
            out.test_data = self.train_data[val_start : val_start + val_length]
            yield out

    def filter(self, filter_fun: Callable) -> "TimeSeriesDataset":
        """Shallow-copy with `filter_fun` applied as a row mask to both splits."""
        if filter_fun is None:
            return self
        out = copy.copy(self)
        out.train_data = self.train_data[filter_fun]
        out.test_data = self.test_data[filter_fun]
        return out

    def prettify_prediction(self, y_pred: Union[pd.DataFrame, pd.Series, np.ndarray]):
        """Coerce a prediction into a DataFrame with the target columns and the
        time column, aligned to test_data's index.

        Raises when test_data is empty and the timestamps cannot be derived.
        """
        if self.test_data is not None and len(self.test_data):
            assert len(y_pred) == len(self.test_data)

            if isinstance(y_pred, np.ndarray):
                y_pred = pd.DataFrame(data=y_pred, columns=self.target_names, index=self.test_data.index)
            elif isinstance(y_pred, pd.Series):
                assert len(self.target_names) == 1, "Not enough columns in y_pred"
                y_pred.name = self.target_names[0]
                y_pred = pd.DataFrame(y_pred)
                y_pred.index = self.test_data.index
            elif isinstance(y_pred, pd.DataFrame):
                y_pred.index = self.test_data.index

            if self.time_col not in y_pred.columns:
                y_pred[self.time_col] = self.test_data[self.time_col]
        else:
            if isinstance(y_pred, np.ndarray):
                raise ValueError("Can't enrich np.ndarray as self.test_data is None")
            elif isinstance(y_pred, pd.Series):
                assert len(self.target_names) == 1, "Not enough columns in y_pred"
                y_pred = pd.DataFrame({self.target_names[0]: y_pred})
            # TODO auto-create the timestamps for the time column instead of throwing
            raise NotImplementedError("Need a non-None test_data for this to work, for now")

        assert isinstance(y_pred, pd.DataFrame)
        assert self.time_col in y_pred.columns
        assert all([t in y_pred.columns for t in self.target_names])
        return y_pred

    def merge_prediction_with_target(self, y_pred: Union[pd.DataFrame, pd.Series, np.ndarray]):
        """Append the prettified prediction below the train targets (time + targets)."""
        y_pred = self.prettify_prediction(y_pred)
        return pd.concat([self.train_data[[self.time_col] + self.target_names], y_pred], axis=0)
def enrich_dataframe(
    df: Union[pd.DataFrame, pd.Series],
    fourier_degree: int,
    remove_constants: bool = False,
    fourier_time: bool = True,
) -> pd.DataFrame:
    """Append calendar features derived from every datetime column of `df`.

    Args:
        df: Input frame (a Series is first promoted to a one-column frame).
        fourier_degree: Harmonics for the within-month Fourier features.
        remove_constants: Drop derived features with fewer than 2 distinct values.
        fourier_time: Use the Fourier-normalized calendar encoding when True.

    Returns:
        The input columns followed by the derived feature columns.
    """
    if isinstance(df, pd.Series):
        df = pd.DataFrame(df)
    extra_frames = []
    for col in df.columns:
        if df[col].dtype.name != "datetime64[ns]":
            continue
        # Within-month position harmonics, prefixed with the source column name.
        fourier_cols = monthly_fourier_features(df[col], fourier_degree)
        fourier_cols.columns = [f"{col}_{c}" for c in fourier_cols.columns]
        fourier_cols.index = df.index
        extra_frames.append(fourier_cols)
        feats = date_feature_dict_fourier(df[col]) if fourier_time else date_feature_dict(df[col])
        if remove_constants:
            # Keep only features that actually vary.
            feats = {k: v for k, v in feats.items() if v.nunique(dropna=False) >= 2}
        extra_frames.append(pd.DataFrame(feats, index=df.index))
    return pd.concat([df] + extra_frames, axis=1, verify_integrity=True)
def enrich_dataset(
    X: TimeSeriesDataset,
    fourier_degree: int = 0,
    remove_constants: bool = False,
    fourier_time: bool = True,
) -> TimeSeriesDataset:
    """Return a new TimeSeriesDataset with calendar features added to both splits."""
    enriched_train = enrich_dataframe(X.train_data, fourier_degree, remove_constants, fourier_time)
    if X.test_data is None:
        enriched_test = None
    else:
        enriched_test = enrich_dataframe(X.test_data, fourier_degree, remove_constants, fourier_time)
    return TimeSeriesDataset(
        train_data=enriched_train,
        time_col=X.time_col,
        target_names=X.target_names,
        time_idx=X.time_idx,
        test_data=enriched_test,
    )
def date_feature_dict(timestamps: pd.Series) -> dict:
    """Expand a timestamp column into sin/cos-encoded calendar features (raw units)."""
    dt = timestamps.dt
    column = timestamps.name
    raw_features = {
        # f"{column}_year": dt.year, # not stationary
        f"{column}_month": dt.month,
        # day-of-month is taken care of with the monthly fourier features
        f"{column}_hour": dt.hour,
        f"{column}_minute": dt.minute,
        f"{column}_second": dt.second,
        f"{column}_dayofweek": dt.dayofweek,
        f"{column}_dayofyear": dt.dayofyear,
        f"{column}_quarter": dt.quarter,
    }
    encoded = {}
    for feature_name, values in raw_features.items():
        encoded.update(fourier_series(values, feature_name))
    return encoded
def date_feature_dict_fourier(timestamps: pd.Series) -> dict:
    """Expand a timestamp column into sin/cos-encoded calendar features,
    with each component first normalized to roughly [0, 1]."""
    dt = timestamps.dt
    column = timestamps.name
    normalized_features = {
        # f"{column}_year": dt.year, # not stationary
        f"{column}_month": dt.month / 12.0,
        # day-of-month is taken care of with the monthly fourier features
        f"{column}_hour": dt.hour / 24.0,
        f"{column}_minute": dt.minute / 60.0,
        f"{column}_second": dt.second / 60.0,
        f"{column}_dayofweek": dt.dayofweek / 7.0,
        f"{column}_dayofyear": dt.dayofyear / 366.0,
        f"{column}_quarter": dt.quarter / 4.0,
    }
    encoded = {}
    for feature_name, values in normalized_features.items():
        encoded.update(fourier_series(values, feature_name))
    return encoded
def fourier_series(feature: pd.Series, name: str):
    """
    Assume feature goes from 0 to 1 cyclically, transform that into Fourier
    @param feature: input feature
    @return: sin(2pi*feature), cos(2pi*feature)
    """
    angle = 2 * math.pi * feature
    return {
        f"{name}_sin": np.sin(angle),
        f"{name}_cos": np.cos(angle),
    }
class DataTransformerTS:
    """Transform input time series training data.

    fit() classifies columns as categorical / numeric / droppable, fits a
    median imputer for numerics and (if needed) a LabelEncoder for the target;
    transform() applies these decisions to new frames.
    """

    def __init__(self, time_col: str, label: Union[str, List[str]], time_idx: str = "time_idx"):
        self.time_col = time_col
        self.time_idx = time_idx
        self.label = label
        # Column groups, populated by fit().
        self.cat_columns = []
        self.num_columns = []
        self.datetime_columns = []
        self.drop_columns = []

    @property
    def _drop(self):
        # Truthy when fit() flagged any columns for dropping.
        return len(self.drop_columns)

    def fit(self, X: Union[DataFrame, np.array], y):
        """Fit transformer.

        Classifies each column of X and fits the numeric imputer and the label
        encoder (the latter only when y is non-numeric). Returns None; the
        decisions are stored on self for use by transform().

        Args:
            X: A pandas dataframe of training data.
            y: A numpy array, pandas series, or single-column dataframe of labels.
        """
        assert isinstance(X, DataFrame)
        X = X.copy()
        n = X.shape[0]

        assert len(self.num_columns) == 0, "Trying to call fit() twice, something is wrong"

        for column in X.columns:
            # sklearn/utils/validation.py needs int/float values
            if X[column].dtype.name in ("object", "category"):
                if (
                    # drop columns where all values are the same
                    X[column].nunique() == 1
                    # this drops UID-type cols
                    or X[column].nunique(dropna=True) == n - X[column].isnull().sum()
                ):
                    self.drop_columns.append(column)
                elif column != self.time_idx:
                    self.cat_columns.append(column)
            elif X[column].nunique(dropna=True) < 2:
                # Constant column: carries no signal.
                self.drop_columns.append(column)
            elif X[column].dtype.name == "datetime64[ns]":
                pass  # these will be processed at model level,
                # so they can also be done in the predict method
            else:
                self.num_columns.append(column)

        if self.num_columns:
            # Median imputation for missing numeric values.
            self.transformer = ColumnTransformer(
                [
                    (
                        "continuous",
                        SimpleImputer(missing_values=np.nan, strategy="median"),
                        self.num_columns,
                    )
                ]
            )
            self.transformer.fit(X[self.num_columns])
        else:
            self.transformer = None

        # TODO: revisit for multivariate series, and recast for a single df input anyway
        if isinstance(y, Series):
            y = y.rename(self.label)

        if isinstance(y, pd.DataFrame):
            ycol = y[y.columns[0]]
        elif isinstance(y, pd.Series):
            ycol = y
        else:
            raise ValueError("y must be either a pd.Series or a pd.DataFrame at this stage")

        if not pd.api.types.is_numeric_dtype(ycol):
            # Non-numeric target: encode labels to integers.
            self.label_transformer = LabelEncoder()
            self.label_transformer.fit(ycol)
        else:
            self.label_transformer = None

    def transform(self, X: Union[DataFrame, np.array], y=None):
        """Apply the fitted column decisions to X (and label-encode y in place).

        Mutates X (drops columns, fills NaNs) and y (label encoding). Returns
        X alone when y is None, otherwise (X, y).
        """
        # TODO: revisit for multivariate series, and recast for a single df input anyway
        if self.label_transformer is not None and y is not None:
            if isinstance(y, pd.DataFrame):
                ycol = y[y.columns[0]]
            elif isinstance(y, pd.Series):
                ycol = y
            else:
                raise ValueError("y must be either a pd.Series or a pd.DataFrame at this stage")
            y_tr = self.label_transformer.transform(ycol)
            # Write the encoded values back, preserving y's shape.
            y.iloc[:] = y_tr.reshape(y.shape)

        X.drop(columns=self.drop_columns, inplace=True)

        for col in self.cat_columns:
            if X[col].dtype.name == "category":
                # Represent missing values with an explicit category.
                if "__NAN__" not in X[col].cat.categories:
                    X[col] = X[col].cat.add_categories("__NAN__").fillna("__NAN__")
            else:
                X[col] = X[col].fillna("__NAN__")
                X[col] = X[col].astype("category")

        for column in self.num_columns:
            X[column] = X[column].fillna(np.nan)

        if self.transformer is not None:
            X[self.num_columns] = self.transformer.transform(X[self.num_columns])

        if y is None:
            return X
        return X, y

    def fit_transform(self, X: Union[DataFrame, np.array], y):
        """Convenience: fit() then transform() on the same (X, y)."""
        self.fit(X, y)
        return self.transform(X, y)
def create_forward_frame(
    frequency: str,
    steps: int,
    test_end_date: datetime.datetime,
    time_col: str,
):
    """Build a single-column dataframe of `steps` future timestamps.

    The timestamps start one `frequency` period after `test_end_date` and are
    spaced at that frequency; the column is named `time_col`.
    """
    first_future = test_end_date + pd.Timedelta(1, frequency)
    future_times = pd.date_range(
        start=first_future,
        periods=steps,
        freq=frequency,
    )
    return pd.DataFrame({time_col: future_times})
def normalize_ts_data(X_train_all, target_names, time_col, y_train_all=None):
    """Coerce heterogeneous (X, y) input into dataframe form.

    A TimeSeriesDataset is returned untouched (ints also pass through, since
    they match none of the conversions). Sparse input is converted to CSR,
    numpy arrays to a dataframe whose first column is `time_col`. When y is
    given it is appended as the `target_names` column(s).
    """
    if isinstance(X_train_all, TimeSeriesDataset):
        return X_train_all

    if issparse(X_train_all):
        X_train_all = X_train_all.tocsr()

    if isinstance(X_train_all, np.ndarray):
        if len(X_train_all.shape) == 1:
            # Promote a 1-D array to a single column.
            X_train_all = np.reshape(X_train_all, (X_train_all.size, 1))
        # First column is the timestamp; the rest get generic names x0, x1, ...
        X_train_all = pd.DataFrame(
            X_train_all,
            columns=[time_col] + [f"x{i}" for i in range(X_train_all.shape[1] - 1)],
        )

    if y_train_all is None:
        return X_train_all

    if isinstance(y_train_all, np.ndarray):
        # TODO: will need to revisit this when doing multivariate y
        y_train_all = pd.DataFrame(
            y_train_all.reshape(len(X_train_all), -1),
            columns=target_names,
            index=X_train_all.index,
        )
    elif isinstance(y_train_all, pd.Series):
        y_train_all = pd.DataFrame(y_train_all)
        y_train_all.index = X_train_all.index

    return pd.concat([X_train_all, y_train_all], axis=1)
def validate_data_basic(X_train_all, y_train_all):
    """Assert that (X, y) have supported types, are non-empty, and row counts match."""
    assert (
        isinstance(X_train_all, (np.ndarray, pd.DataFrame)) or issparse(X_train_all)
    ), "X_train_all must be a numpy array, a pandas dataframe, or Scipy sparse matrix."

    assert isinstance(
        y_train_all, (np.ndarray, pd.Series, pd.DataFrame)
    ), "y_train_all must be a numpy array or a pandas series or DataFrame."

    assert X_train_all.size != 0 and y_train_all.size != 0, "Input data must not be empty, use None if no data"

    assert X_train_all.shape[0] == y_train_all.shape[0], "# rows in X_train must match length of y_train."

View File

@ -1,760 +0,0 @@
import time
import logging
import os
from datetime import datetime
import math
from typing import List, Optional, Union
try:
import pandas as pd
from pandas import DataFrame, Series, to_datetime
except ImportError:
class PD:
pass
pd = PD()
pd.DataFrame = None
pd.Series = None
DataFrame = Series = None
import numpy as np
from flaml import tune
from flaml.model import (
suppress_stdout_stderr,
SKLearnEstimator,
logger,
LGBMEstimator,
XGBoostSklearnEstimator,
RandomForestEstimator,
ExtraTreesEstimator,
XGBoostLimitDepthEstimator,
CatBoostEstimator,
)
from flaml.data import TS_TIMESTAMP_COL, TS_VALUE_COL
from flaml.automl.time_series.ts_data import (
TimeSeriesDataset,
enrich_dataset,
enrich_dataframe,
normalize_ts_data,
create_forward_frame,
)
from flaml.automl.task import Task
class TimeSeriesEstimator(SKLearnEstimator):
    """Base class for time series estimators: stores dataset metadata captured
    at fit() time and provides shared feature enrichment, search-space
    composition, and scoring."""

    def __init__(self, task="ts_forecast", n_jobs=1, **params):
        super().__init__(task, **params)
        # Dataset metadata; populated by fit().
        self.time_col: Optional[str] = None
        self.target_names: Optional[Union[str, List[str]]] = None
        self.frequency: Optional[str] = None
        self.end_date: Optional[datetime] = None
        self.regressors: Optional[List[str]] = None

    def enrich(
        self,
        X: Union[int, TimeSeriesDataset, DataFrame],
        remove_constants: bool = False,
    ):
        """Add date/Fourier features to X.

        An int X means "forecast that many future periods": normalize_ts_data
        passes ints through unchanged, and a forward frame of timestamps is
        built from the fitted frequency and end date.
        """
        X = normalize_ts_data(X, None, self.time_col, None)
        if isinstance(X, int):
            X = create_forward_frame(self.frequency, X, self.end_date, self.time_col)
        fourier_degree = self.params.get("monthly_fourier_degree", 4)

        if isinstance(X, TimeSeriesDataset):
            return enrich_dataset(
                X,
                fourier_degree,
                remove_constants=remove_constants,
                fourier_time=self.params.get("fourier_time_features"),
            )
        return enrich_dataframe(
            X,
            fourier_degree,
            remove_constants=remove_constants,
            fourier_time=self.params.get("fourier_time_features"),
        )

    @classmethod
    def search_space(cls, data: TimeSeriesDataset, task: Task, pred_horizon: int):
        """Combine the subclass-specific space with the shared top-level knobs."""
        space = cls._search_space(data=data, task=task, pred_horizon=pred_horizon)
        space.update(cls.top_search_space())
        return space

    @staticmethod
    def adjust_scale(scale: int, data_len: int, pred_horizon: int):
        """Shrink the seasonality scale until at least 2 lags fit in the
        training portion; asserts if the series is too short even at scale 2."""
        points = data_len - pred_horizon
        max_lags = math.floor(points / scale)

        while scale > 2:
            if max_lags >= 2:
                break
            scale = math.ceil(scale / 1.7)
            max_lags = math.floor(points / scale)

        assert scale >= 2 and max_lags >= 2, f"Too few points ({data_len}) for prediction horizon {pred_horizon}"

        return scale, max_lags

    @classmethod
    def top_search_space(cls):
        """Feature-engineering knobs shared by all TS estimators."""
        return {
            "monthly_fourier_degree": {
                "domain": tune.randint(lower=0, upper=8),
                "init_value": 4,
                "low_cost_init_value": 2,
            },
            "fourier_time_features": {
                "domain": tune.randint(lower=0, upper=2),  # tune.choice([True, False]),
                "init_value": 1,
                "low_cost_init_value": 0,
            },
            "pca_features": {  # disable for now, will deal with occasional svd fail later
                "domain": tune.choice([False]),
                "init_value": False,
                "low_cost_init_value": False,
            },
        }

    @classmethod
    def top_level_params(cls):
        # Params handled at the AutoML level rather than per-estimator.
        return ["monthly_fourier_degree"]

    def _join(self, X_train, y_train):
        """Join X and y into one training frame, y under the TS_VALUE_COL name."""
        assert TS_TIMESTAMP_COL in X_train, (
            "Dataframe for training ts_forecast model must have column"
            f' "{TS_TIMESTAMP_COL}" with the dates in X_train.'
        )
        y_train = DataFrame(y_train, columns=[TS_VALUE_COL])
        train_df = X_train.join(y_train)
        return train_df

    def fit(self, X_train: TimeSeriesDataset, y_train=None, budget=None, **kwargs):
        """Capture dataset metadata from X_train; subclasses do the training.

        # TODO purge y_train
        """
        self.time_col = X_train.time_col
        self.target_names = X_train.target_names
        self.X_train = X_train
        self.frequency = self.X_train.frequency
        self.end_date = self.X_train.end_date

    def score(self, X_val: DataFrame, y_val: Series, **kwargs):
        """Score predictions on X_val with the metric in kwargs, else r2."""
        from sklearn.metrics import r2_score

        from ..ml import metric_loss_score

        y_pred = self.predict(X_val, **kwargs)
        if isinstance(X_val, TimeSeriesDataset):
            # For a dataset input, the ground truth comes from its test split.
            y_val = X_val.test_data[X_val.target_names[0]]
        self._metric = kwargs.get("metric", None)
        if self._metric:
            return metric_loss_score(self._metric, y_pred, y_val)
        else:
            return r2_score(y_pred, y_val)
class Orbit(TimeSeriesEstimator):
    """Wrapper for Uber's Orbit forecasting models (DLT by default)."""

    def fit(self, X_train: TimeSeriesDataset, y_train=None, budget=None, **kwargs):
        """Fit the Orbit model on X_train; returns training time in seconds."""
        # This may be needed to get PyStan to run, needed for Orbit
        os.environ["KMP_DUPLICATE_LIB_OK"] = "True"
        from orbit.models import DLT

        # y_train is ignored, just need it for signature compatibility with other classes

        super().fit(X_train, y_train, budget=budget, **kwargs)
        current_time = time.time()

        # NOTE(review): setLevel() returns None, so self.logger is None here —
        # predict()'s self.logger.warning would raise; confirm intent.
        self.logger = logging.getLogger("orbit").setLevel(logging.WARNING)

        model_class = self.params.get("model_class", DLT)
        self._model = model_class(
            response_col=X_train.target_names[0],
            date_col=X_train.time_col,
            regressor_col=X_train.regressors,
            # TODO: infer seasonality from frequency
            **self.params,
        )

        with suppress_stdout_stderr():
            self._model.fit(df=X_train.train_data.copy())

        train_time = time.time() - current_time
        return train_time

    def predict(self, X: Union[TimeSeriesDataset, DataFrame], **kwargs):
        """Forecast for X; an int X means "that many future periods".

        Returns a frame with the time column, point prediction renamed to the
        target, and the 5%/95% prediction bounds; None if not fitted.
        """
        if isinstance(X, int):
            X = create_forward_frame(
                self.frequency,
                X,
                self.end_date,
                self.time_col,
            )
        elif isinstance(X, TimeSeriesDataset):
            data = X
            X = data.test_data[[self.time_col] + X.regressors]

        if self._model is not None:
            forecast = self._model.predict(X, **kwargs)
            out = (
                DataFrame(
                    forecast[
                        [
                            self.time_col,
                            "prediction",
                            "prediction_5",
                            "prediction_95",
                        ]
                    ]
                )
                .reset_index(drop=True)
                .rename(
                    columns={
                        "prediction": self.target_names[0],
                    }
                )
            )
            return out
        else:
            self.logger.warning("Estimator is not fit yet. Please run fit() before predict().")
            return None

    @classmethod
    def _search_space(cls, **params):
        # TODO: fill in a proper search space
        space = {}
        return space
class Prophet(TimeSeriesEstimator):
    """The class for tuning Prophet."""

    @classmethod
    def _search_space(cls, **params):
        # Prophet's regularization/seasonality knobs.
        space = {
            "changepoint_prior_scale": {
                "domain": tune.loguniform(lower=0.001, upper=0.05),
                "init_value": 0.05,
                "low_cost_init_value": 0.001,
            },
            "seasonality_prior_scale": {
                "domain": tune.loguniform(lower=0.01, upper=10),
                "init_value": 10,
            },
            "holidays_prior_scale": {
                "domain": tune.loguniform(lower=0.01, upper=10),
                "init_value": 10,
            },
            "seasonality_mode": {
                "domain": tune.choice(["additive", "multiplicative"]),
                "init_value": "multiplicative",
            },
        }
        return space

    def fit(self, X_train, y_train=None, budget=None, **kwargs):
        """Fit a Prophet model; returns the training time in seconds."""
        from prophet import Prophet

        X_train = self.enrich(X_train)
        super().fit(X_train, y_train, budget=budget, **kwargs)
        current_time = time.time()

        if isinstance(X_train, TimeSeriesDataset):
            data = X_train
            target_col = data.target_names[0]
            time_col = data.time_col
            regressors = data.regressors
            # this class only supports univariate regression
            train_df = data.train_data[regressors + [target_col, time_col]]
            # Prophet requires the target/time columns to be named "y"/"ds".
            train_df = train_df.rename(columns={target_col: "y", time_col: "ds"})
        else:
            train_df = self._join(X_train, y_train)
            # Every remaining column besides the timestamp/target is a regressor.
            regressors = list(train_df.columns)
            regressors.remove(TS_TIMESTAMP_COL)
            regressors.remove(TS_VALUE_COL)

        train_df = self._preprocess(train_df)
        logging.getLogger("prophet").setLevel(logging.WARNING)
        # Forward only the hyperparameters Prophet's constructor accepts
        # (i.e. the keys of this class's _search_space).
        nice_params = {k: v for k, v in self.params.items() if k in self._search_space()}
        model = Prophet(**nice_params)
        for regressor in regressors:
            model.add_regressor(regressor)
        with suppress_stdout_stderr():
            model.fit(train_df)
        train_time = time.time() - current_time
        self._model = model
        return train_time

    def predict(self, X, **kwargs):
        """Forecast for the rows of X; X must be a DataFrame or TimeSeriesDataset."""
        X = self.enrich(X)
        if isinstance(X, int):
            raise ValueError(
                "predict() with steps is only supported for arima/sarimax."
                " For Prophet, pass a dataframe with the first column containing"
                " the timestamp values."
            )
        if isinstance(X, TimeSeriesDataset):
            data = X
            X = data.test_data[data.regressors + [data.time_col]]
        # Prophet expects the time column to be named "ds".
        X = X.rename(columns={self.time_col: "ds"})
        if self._model is not None:
            X = self._preprocess(X)
            forecast = self._model.predict(X, **kwargs)
            out = forecast["yhat"]
            out.name = self.target_names[0]
            return out
        else:
            logger.warning("Estimator is not fit yet. Please run fit() before predict().")
            # Constant placeholder so callers still get an array of the right length.
            return np.ones(X.shape[0])
class StatsModelsEstimator(TimeSeriesEstimator):
    # Shared predict() for the statsmodels-backed forecasters below
    # (ARIMA / SARIMAX / HoltWinters); subclasses set self._model in fit().
    def predict(self, X, **kwargs) -> pd.Series:
        """Forecast for the periods covered by `X`.

        Args:
            X: an int number of future steps, a TimeSeriesDataset, or a
                DataFrame containing the time column and any regressors.

        Returns:
            A pandas Series of forecasts named after the first target column.
        """
        X = self.enrich(X)
        # Not fit yet (or fit failed): return a constant placeholder forecast.
        if self._model is None or self._model is False:
            return np.ones(X if isinstance(X, int) else X.shape[0])
        if isinstance(X, int):
            return self._model.forecast(steps=X)

        if isinstance(X, TimeSeriesDataset):
            data = X
            X = data.test_data[data.regressors + [data.time_col]]
        else:
            X = X[self.regressors + [self.time_col]]

        if isinstance(X, DataFrame):
            # statsmodels predicts over a [start, end] date range.
            start = X[self.time_col].iloc[0]
            end = X[self.time_col].iloc[-1]
            if len(self.regressors):
                exog = self._preprocess(X[self.regressors])
                forecast = self._model.predict(start=start, end=end, exog=exog.values, **kwargs)
            else:
                forecast = self._model.predict(start=start, end=end, **kwargs)
        else:
            raise ValueError(
                "X needs to be either a pandas Dataframe with dates as the first column"
                " or an int number of periods for predict()."
            )
        forecast.name = self.target_names[0]
        return forecast
class ARIMA(StatsModelsEstimator):
    """The class for tuning ARIMA."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # The (p, d, q) order must be supplied by the search/config layer.
        if not all(p in self.params for p in ("p", "d", "q")):
            # Surface the offending configuration before failing, to aid debugging.
            print("arima params at init time:")
            print(self.params)
            # Cleanup: the old code raised this inside a try, caught it, printed
            # the traceback, and re-raised; callers still see the same ValueError.
            raise ValueError("ARIMA initialized without required params p, d, q")

    @classmethod
    def _search_space(cls, data: TimeSeriesDataset, task: Task, pred_horizon: int, **params):
        """Search space for the (p, d, q) order, scaled via adjust_scale()."""
        scale, _ = cls.adjust_scale(data.next_scale(), len(data.train_data), pred_horizon)
        space = {
            "p": {
                "domain": tune.qrandint(lower=0, upper=2 * scale, q=1),
                "init_value": scale,
                "low_cost_init_value": 0,
            },
            "d": {
                "domain": tune.qrandint(lower=0, upper=6, q=1),
                "init_value": 1,
                "low_cost_init_value": 0,
            },
            "q": {
                "domain": tune.qrandint(lower=0, upper=2 * scale, q=1),
                "init_value": scale,
                "low_cost_init_value": 0,
            },
        }
        return space

    def _join(self, X_train, y_train):
        """Join features/target and move the timestamp into the index, since
        statsmodels expects a DatetimeIndex rather than a date column."""
        train_df = super()._join(X_train, y_train)
        train_df.index = to_datetime(train_df[TS_TIMESTAMP_COL])
        train_df = train_df.drop(TS_TIMESTAMP_COL, axis=1)
        return train_df

    def fit(self, X_train, y_train=None, budget=None, **kwargs):
        """Fit a statsmodels ARIMA; returns the training time in seconds."""
        import warnings

        super().fit(X_train, y_train, budget=budget, **kwargs)
        X_train = self.enrich(X_train, remove_constants=True)

        warnings.filterwarnings("ignore")
        from statsmodels.tsa.arima.model import ARIMA as ARIMA_estimator

        current_time = time.time()
        if isinstance(X_train, TimeSeriesDataset):
            data = X_train
            # this class only supports univariate regression
            target_col = data.target_names[0] if isinstance(data.target_names, list) else data.target_names
            self.regressors = data.regressors
            train_df = data.train_data[self.regressors + [target_col]]
            train_df.index = to_datetime(data.train_data[data.time_col])
            self.time_col = data.time_col
            self.target_names = target_col
        else:
            target_col = TS_VALUE_COL
            train_df = self._join(X_train, y_train)
            self.regressors = list(train_df)
            self.regressors.remove(TS_VALUE_COL)

        train_df = self._preprocess(train_df)

        if len(self.regressors):
            model = ARIMA_estimator(
                train_df[[target_col]],
                exog=train_df[self.regressors],
                order=(self.params["p"], self.params["d"], self.params["q"]),
                enforce_stationarity=False,
                enforce_invertibility=False,
            )
        else:
            model = ARIMA_estimator(
                train_df,
                order=(self.params["p"], self.params["d"], self.params["q"]),
                enforce_stationarity=False,
                enforce_invertibility=False,
            )
        with suppress_stdout_stderr():
            model = model.fit()
        train_time = time.time() - current_time
        self._model = model
        return train_time
class SARIMAX(StatsModelsEstimator):
    """The class for tuning SARIMAX (seasonal ARIMA, optionally with exogenous regressors)."""

    @classmethod
    def _search_space(cls, data: TimeSeriesDataset, task: Task, pred_horizon: int, **params):
        # Space over the non-seasonal order (p, d, q), the seasonal order
        # (P, D, Q), and the seasonal period s.
        scale, max_lags = cls.adjust_scale(data.next_scale(), len(data.train_data), pred_horizon)

        # TODO: instead, downscale the dataset and take next_scale from that for P and Q
        # Candidate seasonal periods: multiples of the base scale that still fit
        # max_lags seasonal lags into the training data.
        scales = [
            s for s in [scale, 2 * scale, 3 * scale, 4 * scale] if s * max_lags <= len(data.train_data) - pred_horizon
        ]

        space = {
            "p": {
                "domain": tune.qrandint(lower=0, upper=scale - 1, q=1),
                "init_value": scale - 1,
                "low_cost_init_value": 0,
            },
            "d": {
                "domain": tune.qrandint(lower=0, upper=6, q=1),
                "init_value": 0,
                "low_cost_init_value": 0,
            },
            "q": {
                "domain": tune.qrandint(lower=0, upper=scale - 1, q=1),
                "init_value": scale - 1,
                "low_cost_init_value": 0,
            },
            "P": {
                "domain": tune.qrandint(lower=0, upper=min(10, max_lags), q=1),
                "init_value": 3,
                "low_cost_init_value": 0,
            },
            "D": {
                "domain": tune.qrandint(lower=0, upper=6, q=1),
                "init_value": 0,
                "low_cost_init_value": 0,
            },
            "Q": {
                "domain": tune.qrandint(lower=0, upper=min(10, max_lags), q=1),
                "init_value": 3,
                "low_cost_init_value": 0,
            },
            "s": {
                "domain": tune.choice(scales),
                "init_value": scale,
            },
        }
        return space

    def fit(self, X_train, y_train=None, budget=None, **kwargs):
        """Fit a statsmodels SARIMAX; returns the training time in seconds."""
        import warnings

        super().fit(X_train, y_train, budget=budget, **kwargs)
        X_train = self.enrich(X_train)

        warnings.filterwarnings("ignore")
        from statsmodels.tsa.statespace.sarimax import SARIMAX as SARIMAX_estimator

        current_time = time.time()
        if isinstance(X_train, TimeSeriesDataset):
            data = X_train
            target_col = data.target_names[0]
            self.regressors = data.regressors
            # this class only supports univariate regression
            train_df = data.train_data[self.regressors + [target_col]]
            # statsmodels expects a DatetimeIndex rather than a date column.
            train_df.index = to_datetime(data.train_data[data.time_col])
        else:
            target_col = TS_VALUE_COL
            train_df = self._join(X_train, y_train)
            # NOTE(review): unlike ARIMA._join, the base _join keeps the
            # timestamp column, so it ends up in self.regressors here (and
            # hence in exog below) — confirm this is intended.
            self.regressors = list(train_df)
            self.regressors.remove(TS_VALUE_COL)

        train_df = self._preprocess(train_df)
        # regressors = list(train_df)
        # regressors.remove(target_col)
        if self.regressors:
            model = SARIMAX_estimator(
                train_df[[target_col]],
                exog=train_df[self.regressors],
                order=(self.params["p"], self.params["d"], self.params["q"]),
                seasonal_order=(
                    self.params["P"],
                    self.params["D"],
                    self.params["Q"],
                    self.params["s"],
                ),
                enforce_stationarity=False,
                enforce_invertibility=False,
            )
        else:
            model = SARIMAX_estimator(
                train_df,
                order=(self.params["p"], self.params["d"], self.params["q"]),
                seasonal_order=(
                    self.params["P"],
                    self.params["D"],
                    self.params["Q"],
                    self.params["s"],
                ),
                enforce_stationarity=False,
                enforce_invertibility=False,
            )
        with suppress_stdout_stderr():
            model = model.fit()
        train_time = time.time() - current_time
        self._model = model
        return train_time
class HoltWinters(StatsModelsEstimator):
    """
    The class for tuning Holt Winters model, aka 'Triple Exponential Smoothing'.
    """

    @classmethod
    def _search_space(cls, data: TimeSeriesDataset, task: Task, pred_horizon: int, **params):
        """Hyperparameter search space for exponential smoothing."""
        space = {
            "damped_trend": {"domain": tune.choice([True, False]), "init_value": False},
            "trend": {"domain": tune.choice(["add", "mul", None]), "init_value": "add"},
            "seasonal": {
                "domain": tune.choice(["add", "mul", None]),
                "init_value": "add",
            },
            "use_boxcox": {"domain": tune.choice([False, True]), "init_value": False},
            "seasonal_periods": {  # statsmodels casts this to None if "seasonal" is None
                "domain": tune.choice([7, 12, 4, 52, 6]),  # weekly, yearly, quarterly, weekly w yearly data
                "init_value": 7,
            },
        }
        return space

    def fit(self, X_train, y_train, budget=None, free_mem_ratio=0, **kwargs):
        """Fit a Holt-Winters (ExponentialSmoothing) model.

        Regressors are ignored: this is a univariate method.
        Returns the training time in seconds.
        """
        import warnings

        warnings.filterwarnings("ignore")
        from statsmodels.tsa.holtwinters import (
            ExponentialSmoothing as HWExponentialSmoothing,
        )

        current_time = time.time()
        super().fit(X_train, y_train, budget=budget, **kwargs)
        X_train = self.enrich(X_train)

        self.regressors = []
        if isinstance(X_train, TimeSeriesDataset):
            data = X_train
            target_col = data.target_names[0]
            regressors = data.regressors
            # this class only supports univariate regression
            train_df = data.train_data[self.regressors + [target_col]]
            train_df.index = to_datetime(data.train_data[data.time_col])
        else:
            target_col = TS_VALUE_COL
            train_df = self._join(X_train, y_train)
            regressors = list(train_df)
            regressors.remove(TS_VALUE_COL)

        if regressors:
            logger.warning("Regressors are ignored for Holt-Winters ETS models.")

        train_df = self._preprocess(train_df)

        # Override incompatible parameters
        if (
            train_df.shape[0] < 2 * self.params["seasonal_periods"]
        ):  # this would prevent heuristic initialization to work properly
            self.params["seasonal"] = None
        # Bug fix: the target column is `target_col`, not necessarily "y";
        # the old `train_df.y` raised AttributeError on TimeSeriesDataset input.
        target_has_zeros = (train_df[target_col] == 0).sum() > 0
        if self.params["seasonal"] == "mul" and target_has_zeros:
            # cannot have multiplicative seasonality in this case
            self.params["seasonal"] = "add"
        if self.params["trend"] == "mul" and target_has_zeros:
            self.params["trend"] = "add"
        if not self.params["seasonal"] or self.params["trend"] not in ["mul", "add"]:
            self.params["damped_trend"] = False

        model = HWExponentialSmoothing(
            train_df[[target_col]],
            damped_trend=self.params["damped_trend"],
            seasonal=self.params["seasonal"],
            trend=self.params["trend"],
            # Bug fix: these tuned hyperparameters were previously never passed
            # to the model, so tuning them had no effect.
            seasonal_periods=self.params["seasonal_periods"],
            # NOTE(review): use_boxcox is a constructor argument in
            # statsmodels>=0.12 — confirm the minimum supported version.
            use_boxcox=self.params["use_boxcox"],
        )
        with suppress_stdout_stderr():
            model = model.fit()
        train_time = time.time() - current_time
        self._model = model
        return train_time
class TS_SKLearn(TimeSeriesEstimator):
    """The class for tuning SKLearn Regressors for time-series forecasting"""

    # Underlying tabular regressor; subclasses override with a concrete estimator.
    base_class = SKLearnEstimator

    @classmethod
    def _search_space(cls, data: TimeSeriesDataset, task: Task, pred_horizon: int, **params):
        # Combine the base regressor's space with a "lags" dimension bounding
        # how many past observations are used as features.
        data_size = data.train_data.shape
        space = cls.base_class.search_space(data_size=data_size, task=task, **params)

        scale, _ = cls.adjust_scale(data.next_scale(), len(data.train_data), pred_horizon)

        max_lags = max(3 * scale, int(np.sqrt(data_size[0])))
        # Keep at least pred_horizon + 1 points available beyond the lags.
        max_lags = min(max_lags, data_size[0] - pred_horizon - 1)

        space.update(
            {
                "lags": {
                    "domain": tune.randint(lower=1, upper=max_lags),
                    "init_value": min(max_lags, scale),
                },
            }
        )
        return space

    def __init__(self, task="ts_forecast", **params):
        # TODO: pass task objects throughout
        super().__init__(task, **params)
        self._model = None  # set by fit()
        self.ts_task = task

    def fit(self, X_train, y_train=None, budget=None, **kwargs):
        """Wrap the base regressor in SklearnWrapper and fit it.

        Returns:
            Training wall-clock time in seconds.
        """
        super().fit(X_train, y_train, budget=budget, **kwargs)
        X_train = self.enrich(X_train)
        current_time = time.time()
        if isinstance(X_train, TimeSeriesDataset):
            data = X_train
            X_train = data.train_data[data.regressors + [data.time_col]]
            self.regressors = data.regressors
            # this class only supports univariate regression
            y_train = data.y_train
            self.time_col = data.time_col
            self.target_names = data.target_names
        elif isinstance(X_train, DataFrame):
            # Convention for plain frames: first column is time, rest are regressors.
            self.time_col = X_train.columns.tolist()[0]
            # X_train = self.transform_X(X_train)
            self.regressors = X_train.columns.tolist()[1:]
        else:
            raise ValueError("Unknown X type")

        X_train = self._preprocess(X_train)

        # Split params: top-level preprocessing knobs are handled elsewhere;
        # the rest are forwarded to the wrapped sklearn-style estimator.
        est_params = {k: v for k, v in self.params.items() if k not in self.top_search_space().keys()}

        from flaml.automl.time_series.sklearn import SklearnWrapper

        horizon = kwargs.pop("period")
        lags = est_params.pop("lags")
        est_params["task"] = self._task
        self._model = SklearnWrapper(
            self.base_class,
            horizon=horizon,
            lags=lags,
            init_params=est_params,
            pca_features=self.params.get("pca_features", False),
        )
        self._model.fit(X_train[self.regressors], y_train)

        train_time = time.time() - current_time
        return train_time

    def predict(self, X, **kwargs):
        """Predict with the fitted wrapper; returns a placeholder if not fit yet."""
        X = self.enrich(X)
        if isinstance(X, TimeSeriesDataset):
            data = X
            X = data.test_data
        if self._model is not None:
            X = X[self.regressors]
            # X = self.transform_X(X)
            X = self._preprocess(X)
            forecast = self._model.predict(X)
            if isinstance(forecast, Series):
                forecast.name = self.target_names[0]
            return forecast
        else:
            logger.warning("Estimator is not fit yet. Please run fit() before predict().")
            return np.ones(X.shape[0])
class LGBM_TS(TS_SKLearn):
    """Time-series forecaster backed by the LGBM regressor."""

    base_class = LGBMEstimator
class XGBoost_TS(TS_SKLearn):
    """Time-series forecaster backed by the XGBoost regressor."""

    base_class = XGBoostSklearnEstimator
class RF_TS(TS_SKLearn):
    """Time-series forecaster backed by the Random Forest regressor."""

    base_class = RandomForestEstimator
class ExtraTrees_TS(TS_SKLearn):
    """Time-series forecaster backed by the Extra Trees regressor."""

    base_class = ExtraTreesEstimator
class XGBoostLimitDepth_TS(TS_SKLearn):
    # Doc fix: the old docstring said "unlimited depth", contradicting the
    # depth-limited base estimator used here.
    """The class for tuning the depth-limited XGBoost Regressor for time-series forecasting"""

    base_class = XGBoostLimitDepthEstimator
# catboost regressor is invalid because it has a `name` parameter, making it incompatible with hcrystalball
class CatBoost_TS(TS_SKLearn):
    """Time-series forecaster backed by the CatBoost regressor."""

    base_class = CatBoostEstimator

View File

@ -1,179 +0,0 @@
"""!
* Copyright (c) Microsoft Corporation. All rights reserved.
* Licensed under the MIT License.
"""
import json
from typing import IO
from contextlib import contextmanager
import logging
logger = logging.getLogger("flaml.automl")
class TrainingLogRecord(object):
    """One line of the training log: metrics and config for a single trial."""

    def __init__(
        self,
        record_id: int,
        iter_per_learner: int,
        logged_metric: float,
        trial_time: float,
        wall_clock_time: float,
        validation_loss: float,
        config: dict,
        learner: str,
        sample_size: int,
    ):
        # Store everything as instance attributes; validation_loss is coerced
        # to float so it serializes/compares consistently.
        self.__dict__.update(
            record_id=record_id,
            iter_per_learner=iter_per_learner,
            logged_metric=logged_metric,
            trial_time=trial_time,
            wall_clock_time=wall_clock_time,
            validation_loss=float(validation_loss),
            config=config,
            learner=learner,
            sample_size=sample_size,
        )

    def dump(self, fp: IO[str]):
        """Serialize this record as one JSON object to the open file `fp`."""
        return json.dump(vars(self), fp)

    @classmethod
    def load(cls, json_str: str):
        """Rebuild a record from a JSON string produced by dump()/__str__()."""
        return cls(**json.loads(json_str))

    def __str__(self):
        return json.dumps(vars(self))
class TrainingLogCheckPoint(TrainingLogRecord):
    # Marker record pointing at the best record seen so far. It deliberately
    # skips TrainingLogRecord.__init__, so it carries only this single field —
    # which is how readers recognize a checkpoint line (a one-key JSON object).
    def __init__(self, curr_best_record_id: int):
        self.curr_best_record_id = curr_best_record_id
class TrainingLogWriter(object):
    """Appends TrainingLogRecords to a file as JSON lines and tracks the best loss."""

    def __init__(self, output_filename: str):
        self.output_filename = output_filename
        self.file = None
        # Id of the record with the best (lowest) validation loss so far.
        self.current_best_loss_record_id = None
        self.current_best_loss = float("+inf")
        self.current_sample_size = None
        # Monotonically increasing id assigned to the next appended record.
        self.current_record_id = 0

    def open(self):
        # Truncates any existing log.
        self.file = open(self.output_filename, "w")

    def append_open(self):
        # Appends to an existing log instead of truncating.
        self.file = open(self.output_filename, "a")

    def append(
        self,
        it_counter: int,
        train_loss: float,
        trial_time: float,
        wall_clock_time: float,
        validation_loss,
        config,
        learner,
        sample_size,
    ):
        """Write one record line and update the best-loss bookkeeping.

        Raises:
            IOError: if the writer has not been opened.
            ValueError: if validation_loss is None.
        """
        if self.file is None:
            raise IOError("Call open() to open the output file first.")
        if validation_loss is None:
            raise ValueError("TEST LOSS NONE ERROR!!!")
        record = TrainingLogRecord(
            self.current_record_id,
            it_counter,
            train_loss,
            trial_time,
            wall_clock_time,
            validation_loss,
            config,
            learner,
            sample_size,
        )
        # A record becomes the new best on a strictly lower loss, or on an
        # equal loss achieved with a larger sample size.
        if (
            validation_loss < self.current_best_loss
            or validation_loss == self.current_best_loss
            and self.current_sample_size is not None
            and sample_size > self.current_sample_size
        ):
            self.current_best_loss = validation_loss
            self.current_sample_size = sample_size
            self.current_best_loss_record_id = self.current_record_id
        self.current_record_id += 1
        record.dump(self.file)
        self.file.write("\n")
        # Flush so the log stays readable even if the process dies mid-run.
        self.file.flush()

    def checkpoint(self):
        """Write a checkpoint line pointing at the best record so far."""
        if self.file is None:
            raise IOError("Call open() to open the output file first.")
        if self.current_best_loss_record_id is None:
            logger.warning("flaml.training_log: checkpoint() called before any record is written, skipped.")
            return
        record = TrainingLogCheckPoint(self.current_best_loss_record_id)
        record.dump(self.file)
        self.file.write("\n")
        self.file.flush()

    def close(self):
        if self.file is not None:
            self.file.close()
            self.file = None  # for pickle
class TrainingLogReader(object):
    """Reads back records written by TrainingLogWriter."""

    def __init__(self, filename: str):
        self.filename = filename
        self.file = None

    def open(self):
        self.file = open(self.filename)

    def records(self):
        """Yield every TrainingLogRecord in the file, skipping checkpoint lines."""
        if self.file is None:
            raise IOError("Call open() before reading log file.")
        for line in self.file:
            data = json.loads(line)
            # Checkpoint lines hold a single key; they are not trial records.
            if len(data) == 1:
                continue
            yield TrainingLogRecord(**data)

    def close(self):
        if self.file is not None:
            self.file.close()
            self.file = None  # for pickle

    def get_record(self, record_id) -> TrainingLogRecord:
        """Return the record with the given id, or raise ValueError."""
        if self.file is None:
            raise IOError("Call open() before reading log file.")
        found = next((rec for rec in self.records() if rec.record_id == record_id), None)
        if found is None:
            raise ValueError(f"Cannot find record with id {record_id}.")
        return found
@contextmanager
def training_log_writer(filename: str, append: bool = False):
    """Context manager yielding an open TrainingLogWriter for `filename`.

    Args:
        filename: Path of the log file.
        append: If True, append to an existing file instead of truncating.
    """
    # Bug fix: construct the writer before entering the try block; previously,
    # if TrainingLogWriter(filename) raised, the finally clause referenced an
    # unbound `w` and masked the original error with a NameError.
    w = TrainingLogWriter(filename)
    try:
        if not append:
            w.open()
        else:
            w.append_open()
        yield w
    finally:
        w.close()
@contextmanager
def training_log_reader(filename: str):
    """Context manager yielding an open TrainingLogReader for `filename`."""
    # Bug fix: construct the reader before the try block so the finally clause
    # never sees an unbound `r` if construction itself fails.
    r = TrainingLogReader(filename)
    try:
        r.open()
        yield r
    finally:
        r.close()

View File

@ -1,15 +0,0 @@
"""!
* Copyright (c) Microsoft Corporation. All rights reserved.
* Licensed under the MIT License.
"""
# Default number of splits (per name, used for cross-validation elsewhere in flaml).
N_SPLITS = 5
# Default random seed.
RANDOM_SEED = 1
# Default holdout split ratio.
SPLIT_RATIO = 0.1
# Memory threshold: 4 GiB in bytes.
MEM_THRES = 4 * (1024**3)
# Row-count threshold; name suggests it separates "small" from "large" datasets.
SMALL_LARGE_THRES = 10000000
# Minimum training sample size when subsampling.
MIN_SAMPLE_TRAIN = 10000
# Row-count threshold; name suggests switching between CV and holdout above it.
CV_HOLDOUT_THRESHOLD = 100000
# Growth factor for the sample size between rounds.
SAMPLE_MULTIPLY_FACTOR = 4
# Epsilon used by the search thread logic — see usage at the call site.
SEARCH_THREAD_EPS = 1.0
PENALTY = 1e10  # penalty term for constraints

View File

@ -1,9 +0,0 @@
import warnings

# Backward-compatibility shim: re-export everything from the new module location.
from flaml.automl.data import *

# Warn users importing from the old `flaml.data` path.
warnings.warn(
    "Importing from `flaml.data` is deprecated. Please use `flaml.automl.data`.",
    DeprecationWarning,
)

View File

@ -1,184 +0,0 @@
# FLAML-Zero: Zero-shot AutoML
## Zero-shot AutoML
There are several ways to use zero-shot AutoML, i.e., train a model with the data-dependent default configuration.
0. Use estimators in `flaml.default.estimator`.
```python
from flaml.default import LGBMRegressor
estimator = LGBMRegressor()
estimator.fit(X_train, y_train)
estimator.predict(X_test)
```
1. Use AutoML.fit(), setting `starting_points="data"` and `max_iter=0`.
```python
X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame)
automl = AutoML()
automl_settings = {
"time_budget": 2,
"task": "classification",
"log_file_name": "test/iris.log",
"starting_points": "data",
"max_iter": 0,
}
automl.fit(X_train, y_train, **automl_settings)
```
2. Use `flaml.default.preprocess_and_suggest_hyperparams`.
```python
from flaml.default import preprocess_and_suggest_hyperparams
X, y = load_iris(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
hyperparams, estimator_class, X_transformed, y_transformed, feature_transformer, label_transformer = preprocess_and_suggest_hyperparams(
"classification", X_train, y_train, "lgbm"
)
model = estimator_class(**hyperparams) # estimator_class is LGBMClassifier
model.fit(X_transformed, y_train) # LGBMClassifier can handle raw labels
X_test = feature_transformer.transform(X_test) # preprocess test data
y_pred = model.predict(X_test)
```
If you want to use your own meta-learned defaults, specify the path containing the meta-learned defaults. For example,
```python
X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame)
automl = AutoML()
automl_settings = {
"time_budget": 2,
"task": "classification",
"log_file_name": "test/iris.log",
"starting_points": "data:test/default",
"estimator_list": ["lgbm", "xgb_limitdepth", "rf"]
"max_iter": 0,
}
automl.fit(X_train, y_train, **automl_settings)
```
Since this is a multiclass task, it will look for the following files under `test/default/`:
- `all/multiclass.json`.
- `{learner_name}/multiclass.json` for every learner_name in the estimator_list.
Read the next subsection to understand how to generate these files if you would like to meta-learn the defaults yourself.
To perform hyperparameter search starting with the data-dependent defaults, remove `max_iter=0`.
## Perform Meta Learning
FLAML provides a package `flaml.default` to learn defaults customized for your own tasks/learners/metrics.
### Prepare a collection of training tasks
Collect a diverse set of training tasks. For each task, extract its meta feature and save in a .csv file. For example, test/default/all/metafeatures.csv:
```
Dataset,NumberOfInstances,NumberOfFeatures,NumberOfClasses,PercentageOfNumericFeatures
2dplanes,36691,10,0,1.0
adult,43957,14,2,0.42857142857142855
Airlines,485444,7,2,0.42857142857142855
Albert,382716,78,2,0.3333333333333333
Amazon_employee_access,29492,9,2,0.0
bng_breastTumor,104976,9,0,0.1111111111111111
bng_pbc,900000,18,0,0.5555555555555556
car,1555,6,4,0.0
connect-4,60801,42,3,0.0
dilbert,9000,2000,5,1.0
Dionis,374569,60,355,1.0
poker,922509,10,0,1.0
```
The first column is the dataset name, and the latter four are meta features.
### Prepare the candidate configurations
You can extract the best configurations for each task in your collection of training tasks by running flaml on each of them with a long enough budget. Save the best configuration in a .json file under `{location_for_defaults}/{learner_name}/{task_name}.json`. For example,
```python
X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame)
automl.fit(X_train, y_train, estimator_list=["lgbm"], **settings)
automl.save_best_config("test/default/lgbm/iris.json")
```
### Evaluate each candidate configuration on each task
Save the evaluation results in a .csv file. For example, save the evaluation results for lgbm under `test/default/lgbm/results.csv`:
```
task,fold,type,result,params
2dplanes,0,regression,0.946366,{'_modeljson': 'lgbm/2dplanes.json'}
2dplanes,0,regression,0.907774,{'_modeljson': 'lgbm/adult.json'}
2dplanes,0,regression,0.901643,{'_modeljson': 'lgbm/Airlines.json'}
2dplanes,0,regression,0.915098,{'_modeljson': 'lgbm/Albert.json'}
2dplanes,0,regression,0.302328,{'_modeljson': 'lgbm/Amazon_employee_access.json'}
2dplanes,0,regression,0.94523,{'_modeljson': 'lgbm/bng_breastTumor.json'}
2dplanes,0,regression,0.945698,{'_modeljson': 'lgbm/bng_pbc.json'}
2dplanes,0,regression,0.946194,{'_modeljson': 'lgbm/car.json'}
2dplanes,0,regression,0.945549,{'_modeljson': 'lgbm/connect-4.json'}
2dplanes,0,regression,0.946232,{'_modeljson': 'lgbm/default.json'}
2dplanes,0,regression,0.945594,{'_modeljson': 'lgbm/dilbert.json'}
2dplanes,0,regression,0.836996,{'_modeljson': 'lgbm/Dionis.json'}
2dplanes,0,regression,0.917152,{'_modeljson': 'lgbm/poker.json'}
adult,0,binary,0.927203,{'_modeljson': 'lgbm/2dplanes.json'}
adult,0,binary,0.932072,{'_modeljson': 'lgbm/adult.json'}
adult,0,binary,0.926563,{'_modeljson': 'lgbm/Airlines.json'}
adult,0,binary,0.928604,{'_modeljson': 'lgbm/Albert.json'}
adult,0,binary,0.911171,{'_modeljson': 'lgbm/Amazon_employee_access.json'}
adult,0,binary,0.930645,{'_modeljson': 'lgbm/bng_breastTumor.json'}
adult,0,binary,0.928603,{'_modeljson': 'lgbm/bng_pbc.json'}
adult,0,binary,0.915825,{'_modeljson': 'lgbm/car.json'}
adult,0,binary,0.919499,{'_modeljson': 'lgbm/connect-4.json'}
adult,0,binary,0.930109,{'_modeljson': 'lgbm/default.json'}
adult,0,binary,0.932453,{'_modeljson': 'lgbm/dilbert.json'}
adult,0,binary,0.921959,{'_modeljson': 'lgbm/Dionis.json'}
adult,0,binary,0.910763,{'_modeljson': 'lgbm/poker.json'}
...
```
The `type` column indicates the type of the task, such as regression, binary or multiclass.
The `result` column stores the evaluation result, assuming the larger the better. The `params` column indicates which json config is used. For example, 'lgbm/2dplanes.json' indicates that the best lgbm configuration extracted from 2dplanes is used.
### Learn data-dependent defaults
To recap, the inputs required for meta-learning are:
1. Metafeatures: e.g., `{location}/all/metafeatures.csv`.
1. Configurations: `{location}/{learner_name}/{task_name}.json`.
1. Evaluation results: `{location}/{learner_name}/results.csv`.
For example, if the input location is "test/default", learners are lgbm, xgb_limitdepth and rf, the following command learns data-dependent defaults for binary classification tasks.
```bash
python portfolio.py --output test/default --input test/default --metafeatures test/default/all/metafeatures.csv --task binary --estimator lgbm xgb_limitdepth rf
```
It will produce the following files as output:
- test/default/lgbm/binary.json: the learned defaults for lgbm.
- test/default/xgb_limitdepth/binary.json: the learned defaults for xgb_limitdepth.
- test/default/rf/binary.json: the learned defaults for rf.
- test/default/all/binary.json: the learned defaults for lgbm, xgb_limitdepth and rf together.
Change "binary" into "multiclass" or "regression" for the other tasks.
## Reference
For more technical details, please check our research paper.
* [Mining Robust Default Configurations for Resource-constrained AutoML](https://arxiv.org/abs/2202.09927). Moe Kayali, Chi Wang. arXiv preprint arXiv:2202.09927 (2022).
```bibtex
@article{Kayali2022default,
title={Mining Robust Default Configurations for Resource-constrained AutoML},
author={Moe Kayali and Chi Wang},
year={2022},
journal={arXiv preprint arXiv:2202.09927},
}
```

View File

@ -1,18 +0,0 @@
from .suggest import (
suggest_config,
suggest_learner,
suggest_hyperparams,
preprocess_and_suggest_hyperparams,
meta_feature,
)
from .estimator import (
flamlize_estimator,
LGBMClassifier,
LGBMRegressor,
XGBClassifier,
XGBRegressor,
RandomForestClassifier,
RandomForestRegressor,
ExtraTreesClassifier,
ExtraTreesRegressor,
)

View File

@ -1,946 +0,0 @@
{
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "lgbm",
"hyperparameters": {
"n_estimators": 2541,
"num_leaves": 1667,
"min_child_samples": 29,
"learning_rate": 0.0016660662914022302,
"log_max_bin": 8,
"colsample_bytree": 0.5157078343718623,
"reg_alpha": 0.045792841240713165,
"reg_lambda": 0.0012362651138125363,
"FLAML_sample_size": 436899
}
},
{
"class": "lgbm",
"hyperparameters": {
"n_estimators": 141,
"num_leaves": 139,
"min_child_samples": 8,
"learning_rate": 0.04824748268727149,
"log_max_bin": 9,
"colsample_bytree": 0.5261441571042451,
"reg_alpha": 0.002896920833899335,
"reg_lambda": 0.024463247502165594
}
},
{
"class": "lgbm",
"hyperparameters": {
"n_estimators": 31204,
"num_leaves": 4,
"min_child_samples": 3,
"learning_rate": 0.009033979476164342,
"log_max_bin": 10,
"colsample_bytree": 0.5393339924944204,
"reg_alpha": 15.800090067239827,
"reg_lambda": 34.82471227276953
}
},
{
"class": "lgbm",
"hyperparameters": {
"n_estimators": 362,
"num_leaves": 1208,
"min_child_samples": 8,
"learning_rate": 0.02070742242160566,
"log_max_bin": 4,
"colsample_bytree": 0.37915528071680865,
"reg_alpha": 0.002982599447751338,
"reg_lambda": 1.136605174453919,
"FLAML_sample_size": 337147
}
},
{
"class": "lgbm",
"hyperparameters": {}
},
{
"class": "xgboost",
"hyperparameters": {
"n_estimators": 319,
"max_leaves": 1312,
"min_child_weight": 0.001,
"learning_rate": 0.01872379806270421,
"subsample": 0.6890079660561895,
"colsample_bylevel": 0.7551225121854014,
"colsample_bytree": 0.7860755604500558,
"reg_alpha": 0.17028752704343114,
"reg_lambda": 1.4375743264564231
}
},
{
"class": "xgboost",
"hyperparameters": {
"n_estimators": 7902,
"max_leaves": 49,
"min_child_weight": 0.038063497848955595,
"learning_rate": 0.0009765625,
"subsample": 0.9357800695141445,
"colsample_bylevel": 0.47031312177249246,
"colsample_bytree": 0.9053386579586192,
"reg_alpha": 1.5286102593845932,
"reg_lambda": 18.96811296717419
}
},
{
"class": "xgboost",
"hyperparameters": {
"n_estimators": 13499,
"max_leaves": 60,
"min_child_weight": 0.008494221584011285,
"learning_rate": 0.006955765856675575,
"subsample": 0.5965241023754743,
"colsample_bylevel": 0.590641168068946,
"colsample_bytree": 1.0,
"reg_alpha": 0.2522240954379289,
"reg_lambda": 5.351809144038808
}
},
{
"class": "xgboost",
"hyperparameters": {
"n_estimators": 591,
"max_leaves": 16651,
"min_child_weight": 0.03356567864689129,
"learning_rate": 0.002595066436678338,
"subsample": 0.9114132805513452,
"colsample_bylevel": 0.9503441844594458,
"colsample_bytree": 0.5703338448066768,
"reg_alpha": 0.010405212349127894,
"reg_lambda": 0.05352660657433639
}
},
{
"class": "xgb_limitdepth",
"hyperparameters": {
"n_estimators": 877,
"max_depth": 11,
"min_child_weight": 0.6205465771093738,
"learning_rate": 0.013622118381700795,
"subsample": 0.566692814245426,
"colsample_bylevel": 0.8865741642101924,
"colsample_bytree": 1.0,
"reg_alpha": 0.01386336444764391,
"reg_lambda": 3.113947886074155
}
},
{
"class": "xgb_limitdepth",
"hyperparameters": {
"n_estimators": 5457,
"max_depth": 6,
"min_child_weight": 0.19978269031877885,
"learning_rate": 0.003906732665632749,
"subsample": 0.8207785234496902,
"colsample_bylevel": 0.8438751931476698,
"colsample_bytree": 0.42202862997585794,
"reg_alpha": 0.017372558844968737,
"reg_lambda": 0.03977802121721031
}
},
{
"class": "xgb_limitdepth",
"hyperparameters": {
"n_estimators": 3526,
"max_depth": 13,
"min_child_weight": 0.0994486725676356,
"learning_rate": 0.0009765625,
"subsample": 0.46123759274652554,
"colsample_bylevel": 1.0,
"colsample_bytree": 0.4498813776397717,
"reg_alpha": 0.002599398546499414,
"reg_lambda": 0.028336396854402753
}
},
{
"class": "xgb_limitdepth",
"hyperparameters": {}
},
{
"class": "rf",
"hyperparameters": {
"n_estimators": 501,
"max_features": 0.24484242524861066,
"max_leaves": 1156,
"criterion": "entropy"
}
},
{
"class": "rf",
"hyperparameters": {
"n_estimators": 356,
"max_features": 0.1,
"max_leaves": 102,
"criterion": "gini"
}
},
{
"class": "rf",
"hyperparameters": {
"n_estimators": 1000,
"max_features": 0.1779692423238241,
"max_leaves": 7499,
"criterion": "gini"
}
},
{
"class": "rf",
"hyperparameters": {}
},
{
"class": "extra_tree",
"hyperparameters": {
"n_estimators": 1080,
"max_features": 1.0,
"max_leaves": 590,
"criterion": "entropy"
}
},
{
"class": "extra_tree",
"hyperparameters": {
"n_estimators": 2047,
"max_features": 0.46132798093546956,
"max_leaves": 12856,
"criterion": "gini"
}
},
{
"class": "extra_tree",
"hyperparameters": {
"n_estimators": 408,
"max_features": 0.3629795757973625,
"max_leaves": 81,
"criterion": "entropy"
}
},
{
"class": "extra_tree",
"hyperparameters": {
"n_estimators": 553,
"max_features": 0.9592132391435095,
"max_leaves": 1127,
"criterion": "entropy"
}
},
{
"class": "extra_tree",
"hyperparameters": {}
}
],
"preprocessing": {
"center": [
18000.0,
28.0,
2.0,
0.7565217391304347
],
"scale": [
42124.0,
130.0,
1.0,
0.5714285714285715
]
},
"neighbors": [
{
"features": [
1.196467571930491,
1.0923076923076922,
0.0,
0.4260869565217391
],
"choice": [
5,
18,
19,
4,
8,
3,
9,
7,
10,
6,
21,
2,
20,
17,
13,
16,
15,
1,
14,
12,
0,
11
]
},
{
"features": [
11.096856898680088,
-0.16153846153846155,
0.0,
-0.5739130434782609
],
"choice": [
0,
5,
7,
9,
11,
8,
1,
18,
15,
12,
3,
2,
10,
20,
4,
6,
13,
17,
14,
19,
16,
21
]
},
{
"features": [
8.658152122305575,
0.38461538461538464,
0.0,
-0.7405797101449274
],
"choice": [
7,
9,
2,
5,
10,
1,
0,
3,
12,
4,
6,
11,
8,
18,
15,
13,
20,
16,
17,
21,
14,
19
]
},
{
"features": [
0.27281359794891274,
-0.14615384615384616,
0.0,
-1.3239130434782607
],
"choice": [
8,
11,
0,
5,
1,
15,
13,
16,
10,
9,
20,
7,
17,
12,
4,
3,
21,
18,
6,
14,
19,
2
]
},
{
"features": [
-0.4125676573924604,
-0.1076923076923077,
0.0,
-0.5739130434782609
],
"choice": [
19,
15,
11,
17,
8,
14,
13,
16,
3,
18,
21,
6,
9,
10,
20,
5,
7,
1,
0,
12,
2,
4
]
},
{
"features": [
0.6409647706770487,
1.5538461538461539,
0.0,
0.0
],
"choice": [
2,
14,
10,
19,
6,
0,
1,
4,
11,
3,
5,
17,
9,
13,
12,
20,
7,
15,
18,
8,
16,
21
]
},
{
"features": [
2.3515573069983855,
0.16923076923076924,
0.0,
0.4260869565217391
],
"choice": [
7,
9,
10,
5,
2,
0,
3,
1,
12,
4,
6,
11,
18,
8,
15,
13,
16,
21,
20,
17,
14,
19
]
},
{
"features": [
0.6162045389801538,
-0.1076923076923077,
0.0,
-0.5739130434782609
],
"choice": [
10,
12,
1,
4,
11,
6,
9,
0,
2,
5,
3,
7,
8,
13,
20,
17,
15,
14,
16,
19,
18,
21
]
},
{
"features": [
0.5386240622922799,
-0.09230769230769231,
0.0,
-0.5582880434782608
],
"choice": [
1,
0,
5,
11,
10,
9,
6,
4,
3,
20,
17,
18,
13,
15,
16,
8,
7,
2,
12,
21,
19,
14
]
},
{
"features": [
-0.41133320672300827,
-0.18461538461538463,
0.0,
0.4260869565217391
],
"choice": [
14,
9,
7,
10,
15,
13,
3,
6,
16,
5,
19,
2,
12,
18,
4,
21,
20,
0,
11,
17,
1,
8
]
},
{
"features": [
-0.31155635742094767,
12.36923076923077,
0.0,
0.3865087169129372
],
"choice": [
7,
2,
6,
10,
3,
0,
9,
20,
5,
1,
18,
11,
8,
17,
4,
13,
15,
12,
14,
16,
19,
21
]
},
{
"features": [
-0.40594435476213087,
-0.06153846153846154,
0.0,
-0.7114130434782607
],
"choice": [
9,
5,
6,
1,
0,
13,
15,
7,
19,
4,
16,
3,
10,
12,
11,
18,
14,
8,
17,
20,
21,
2
]
},
{
"features": [
0.0,
32.83076923076923,
0.0,
0.4260869565217391
],
"choice": [
20,
17,
0,
1,
18,
3,
13,
9,
10,
5,
11,
15,
2,
4,
12,
16,
14,
19,
21
]
},
{
"features": [
1.6675766783781218,
0.0,
0.0,
0.4260869565217391
],
"choice": [
7,
9,
5,
0,
1,
10,
6,
11,
4,
2,
12,
3,
8,
15,
13,
18,
16,
20,
17,
21,
14,
19
]
},
{
"features": [
-0.36356946158959264,
0.8923076923076924,
0.0,
-1.2266908212560386
],
"choice": [
8,
15,
3,
13,
16,
11,
4,
0,
20,
6,
14,
5,
1,
21,
17,
9,
10,
18,
19,
7,
12,
2
]
},
{
"features": [
-0.38225239768303104,
-0.05384615384615385,
0.0,
0.4260869565217391
],
"choice": [
16,
13,
15,
18,
17,
14,
20,
8,
10,
9,
3,
7,
19,
21,
11,
1,
5,
0,
6,
4,
2,
12
]
},
{
"features": [
-0.3590352293229513,
0.06153846153846154,
0.0,
-1.3239130434782607
],
"choice": [
7,
9,
10,
4,
5,
17,
19,
20,
12,
18,
6,
13,
16,
0,
1,
3,
15,
21,
14,
11,
8,
2
]
},
{
"features": [
0.3090399772101415,
0.6923076923076923,
0.0,
-0.003997789240972687
],
"choice": [
7,
9,
10,
1,
12,
5,
3,
4,
0,
11,
20,
8,
17,
13,
6,
15,
16,
21,
18,
2,
14,
19
]
},
{
"features": [
-0.3118649700883107,
-0.17692307692307693,
0.0,
0.4260869565217391
],
"choice": [
20,
18,
21,
17,
7,
9,
15,
13,
1,
16,
4,
12,
5,
0,
10,
14,
6,
11,
8,
3,
2,
19
]
},
{
"features": [
0.0,
32.83076923076923,
0.0,
0.4260869565217391
],
"choice": [
9,
10,
0,
5,
1,
12,
3,
4,
2,
21,
11,
16,
18,
20,
15,
8,
17,
13,
14,
19
]
},
{
"features": [
-0.3178473079479632,
-0.06153846153846154,
0.0,
0.4260869565217391
],
"choice": [
18,
17,
20,
1,
5,
21,
0,
8,
4,
3,
10,
12,
9,
13,
11,
6,
16,
15,
7,
19,
14,
2
]
}
],
"configsource": [
"lgbm/Airlines",
"lgbm/riccardo",
"lgbm/fried",
"lgbm/Dionis",
"lgbm/default",
"xgboost/fabert",
"xgboost/bng_lowbwt",
"xgboost/pol",
"xgboost/Amazon_employee_access",
"xgb_limitdepth/Jannis",
"xgb_limitdepth/adult",
"xgb_limitdepth/Amazon_employee_access",
"xgb_limitdepth/default",
"rf/Amazon_employee_access",
"rf/kc1",
"rf/Helena",
"rf/default",
"extra_tree/segment",
"extra_tree/Helena",
"extra_tree/kr-vs-kp",
"extra_tree/bank-marketing",
"extra_tree/default"
]
}

File diff suppressed because it is too large Load Diff

View File

@ -1,885 +0,0 @@
{
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "lgbm",
"hyperparameters": {
"n_estimators": 4797,
"num_leaves": 122,
"min_child_samples": 2,
"learning_rate": 0.022635758411078528,
"log_max_bin": 9,
"colsample_bytree": 0.7019911744574896,
"reg_alpha": 0.004252223402511765,
"reg_lambda": 0.11288241427227624
}
},
{
"class": "lgbm",
"hyperparameters": {
"n_estimators": 1009,
"num_leaves": 42,
"min_child_samples": 12,
"learning_rate": 0.02167229637171611,
"log_max_bin": 7,
"colsample_bytree": 0.7385038460573171,
"reg_alpha": 0.003607184551842614,
"reg_lambda": 12.08340803550741
}
},
{
"class": "lgbm",
"hyperparameters": {
"n_estimators": 32767,
"num_leaves": 372,
"min_child_samples": 4,
"learning_rate": 0.03517259015200922,
"log_max_bin": 5,
"colsample_bytree": 1.0,
"reg_alpha": 0.02271142170225636,
"reg_lambda": 0.001963791798843179,
"FLAML_sample_size": 830258
}
},
{
"class": "lgbm",
"hyperparameters": {}
},
{
"class": "xgboost",
"hyperparameters": {
"n_estimators": 6357,
"max_leaves": 206,
"min_child_weight": 1.9495322566288034,
"learning_rate": 0.0068766724195393905,
"subsample": 0.9451618245005704,
"colsample_bylevel": 0.9030482524943064,
"colsample_bytree": 0.9278972006416252,
"reg_alpha": 0.01857648400903689,
"reg_lambda": 6.021166480604588,
"FLAML_sample_size": 344444
}
},
{
"class": "xgboost",
"hyperparameters": {
"n_estimators": 23045,
"max_leaves": 247,
"min_child_weight": 0.004319397499079841,
"learning_rate": 0.0032914413473281215,
"subsample": 0.7334190564433234,
"colsample_bylevel": 1.0,
"colsample_bytree": 1.0,
"reg_alpha": 0.03514226467919635,
"reg_lambda": 1.2679661021665851
}
},
{
"class": "xgboost",
"hyperparameters": {
"n_estimators": 1899,
"max_leaves": 59,
"min_child_weight": 0.013389019900720164,
"learning_rate": 0.0028943401472847964,
"subsample": 0.7808944208233943,
"colsample_bylevel": 1.0,
"colsample_bytree": 0.9999355357362375,
"reg_alpha": 0.7905117773932884,
"reg_lambda": 2.916897119216104
}
},
{
"class": "xgboost",
"hyperparameters": {
"n_estimators": 5611,
"max_leaves": 61,
"min_child_weight": 0.01070518287797225,
"learning_rate": 0.005485127037677848,
"subsample": 0.4713518256961299,
"colsample_bylevel": 0.9777437906530106,
"colsample_bytree": 0.9519335125615331,
"reg_alpha": 0.03621564207188963,
"reg_lambda": 1.8045765669466283
}
},
{
"class": "xgb_limitdepth",
"hyperparameters": {
"n_estimators": 4923,
"max_depth": 12,
"min_child_weight": 0.7625732991776795,
"learning_rate": 0.009239549681857523,
"subsample": 0.8193164619615052,
"colsample_bylevel": 0.7785754297307862,
"colsample_bytree": 0.788491073979525,
"reg_alpha": 0.002282749364196872,
"reg_lambda": 131.2194560716441
}
},
{
"class": "xgb_limitdepth",
"hyperparameters": {
"n_estimators": 2111,
"max_depth": 9,
"min_child_weight": 3.405822241186395,
"learning_rate": 0.005804247705198151,
"subsample": 0.37848422782052427,
"colsample_bylevel": 0.8228350674288559,
"colsample_bytree": 0.8813475713109656,
"reg_alpha": 0.009761356063132219,
"reg_lambda": 13.187783936727843,
"FLAML_sample_size": 810000
}
},
{
"class": "xgb_limitdepth",
"hyperparameters": {
"n_estimators": 1499,
"max_depth": 11,
"min_child_weight": 0.07563529776156448,
"learning_rate": 0.039042609221240955,
"subsample": 0.7832981935783824,
"colsample_bylevel": 1.0,
"colsample_bytree": 1.0,
"reg_alpha": 0.0009765625,
"reg_lambda": 23.513066752844153
}
},
{
"class": "xgb_limitdepth",
"hyperparameters": {
"n_estimators": 19722,
"max_depth": 11,
"min_child_weight": 6.46800727978204,
"learning_rate": 0.0010837437950202355,
"subsample": 0.49509562408032115,
"colsample_bylevel": 1.0,
"colsample_bytree": 0.8826299329274134,
"reg_alpha": 0.23887161121959208,
"reg_lambda": 15.163773888208217
}
},
{
"class": "xgb_limitdepth",
"hyperparameters": {
"n_estimators": 544,
"max_depth": 12,
"min_child_weight": 79.32555867011995,
"learning_rate": 0.010128107120014433,
"subsample": 0.9799974977817297,
"colsample_bylevel": 0.881815418056542,
"colsample_bytree": 0.9718556912196423,
"reg_alpha": 72.63148950428749,
"reg_lambda": 1.4601415712058006
}
},
{
"class": "xgb_limitdepth",
"hyperparameters": {}
},
{
"class": "rf",
"hyperparameters": {
"n_estimators": 960,
"max_features": 0.694616932858775,
"max_leaves": 8937
}
},
{
"class": "rf",
"hyperparameters": {
"n_estimators": 2047,
"max_features": 1.0,
"max_leaves": 32767,
"FLAML_sample_size": 830258
}
},
{
"class": "rf",
"hyperparameters": {
"n_estimators": 2047,
"max_features": 0.6683903035731483,
"max_leaves": 591,
"criterion": "entropy"
}
},
{
"class": "rf",
"hyperparameters": {}
},
{
"class": "extra_tree",
"hyperparameters": {
"n_estimators": 1233,
"max_features": 1.0,
"max_leaves": 6452
}
},
{
"class": "extra_tree",
"hyperparameters": {}
},
{
"class": "extra_tree",
"hyperparameters": {
"n_estimators": 346,
"max_features": 1.0,
"max_leaves": 1007,
"criterion": "entropy"
}
},
{
"class": "extra_tree",
"hyperparameters": {
"n_estimators": 2047,
"max_features": 0.5106397565689275,
"max_leaves": 32767,
"FLAML_sample_size": 319382
}
}
],
"preprocessing": {
"center": [
36691.0,
10.0,
0.0,
0.85
],
"scale": [
463680.0,
8.5,
1.0,
0.48611111111111116
]
},
"neighbors": [
{
"features": [
0.0,
0.0,
0.0,
0.3085714285714286
],
"choice": [
3,
6,
12,
1,
16,
20,
7,
13,
9,
8,
4,
11,
0,
14,
18,
15,
5,
17,
10,
21,
2,
19
]
},
{
"features": [
0.6972675120772946,
10.588235294117647,
0.0,
0.3085714285714286
],
"choice": [
19,
18,
21,
20
]
},
{
"features": [
-0.05244133885438233,
3.5294117647058822,
0.0,
0.3085714285714286
],
"choice": [
1,
0,
3,
14,
17,
15,
16,
10,
8,
18,
2,
19,
20,
4,
21,
13,
9,
5,
7,
11,
6,
12
]
},
{
"features": [
1.8618637853692201,
-0.11764705882352941,
0.0,
-0.3771428571428571
],
"choice": [
12,
7,
4,
9,
13,
8,
1,
6,
3,
5,
16,
10,
0,
18,
14,
20,
15,
17,
19,
2,
21
]
},
{
"features": [
0.1472675120772947,
-0.11764705882352941,
0.0,
-1.52
],
"choice": [
1,
12,
9,
3,
7,
6,
11,
13,
16,
20,
8,
4,
18,
0,
10,
14,
21,
5,
15,
17,
2,
19
]
},
{
"features": [
-0.045171238785369223,
-0.11764705882352941,
0.0,
-0.3771428571428571
],
"choice": [
12,
6,
1,
3,
16,
9,
20,
15,
14,
11,
7,
21,
18,
17,
4,
8,
19,
5,
13,
0,
10,
2
]
},
{
"features": [
1.8618637853692201,
9.411764705882353,
0.0,
0.3085714285714286
],
"choice": [
19,
18,
21,
20
]
},
{
"features": [
-0.018758626639061422,
-0.11764705882352941,
0.0,
-1.2914285714285714
],
"choice": [
6,
3,
12,
9,
1,
16,
20,
13,
7,
11,
8,
18,
4,
14,
10,
15,
0,
17,
21,
5,
19,
2
]
},
{
"features": [
1.8618637853692201,
0.9411764705882353,
0.0,
-0.6057142857142855
],
"choice": [
0,
5,
4,
8,
10,
12,
7,
9,
1,
2,
13,
3,
6,
14,
19,
17,
21,
18,
16,
20
]
},
{
"features": [
1.8618637853692201,
0.0,
0.0,
-1.5428571428571427
],
"choice": [
9,
7,
1,
4,
6,
3,
12,
13,
0,
8,
10,
5,
14,
16,
20,
18,
21,
15,
2,
17,
19
]
},
{
"features": [
0.2647105762594893,
0.0,
0.0,
0.3085714285714286
],
"choice": [
12,
6,
1,
3,
13,
7,
16,
9,
20,
0,
8,
4,
11,
14,
18,
5,
10,
15,
17,
21,
2,
19
]
},
{
"features": [
-0.058378623188405795,
0.23529411764705882,
0.0,
-0.3771428571428571
],
"choice": [
0,
3,
1,
2
]
},
{
"features": [
0.0,
0.0,
0.0,
0.3085714285714286
],
"choice": [
7,
9,
1,
11,
8,
0,
4,
5,
6,
3,
10,
2,
13,
12,
19,
18,
21,
15,
14,
17,
20,
16
]
},
{
"features": [
-0.03490769496204279,
0.7058823529411765,
0.0,
0.3085714285714286
],
"choice": [
7,
11,
5,
4,
9,
1,
8,
3,
6,
0,
10,
2,
17,
12,
15,
14,
16,
13,
19,
18,
21,
20
]
},
{
"features": [
-0.03490769496204279,
-0.23529411764705882,
0.0,
0.3085714285714286
],
"choice": [
6,
4,
8,
5,
7,
9,
11,
10,
3,
1,
18,
12,
21,
19,
0,
14,
16,
20,
15,
13,
17,
2
]
},
{
"features": [
-0.03906789164941339,
-0.23529411764705882,
0.0,
0.3085714285714286
],
"choice": [
0,
4,
7,
5,
11,
1,
8,
10,
9,
6,
12,
3,
13,
14,
15,
17,
16,
2,
21,
18,
19,
20
]
},
{
"features": [
0.0,
0.0,
0.0,
-0.3085714285714286
],
"choice": [
18,
19,
20,
10,
15,
17,
5,
11,
14,
4,
7,
9,
21,
8,
3,
6,
13,
1,
16,
12,
0,
2
]
},
{
"features": [
1.050207039337474,
0.9411764705882353,
0.0,
-0.7199999999999999
],
"choice": [
17,
15,
14,
16
]
},
{
"features": [
0.686201690821256,
-0.11764705882352941,
0.0,
-1.0628571428571427
],
"choice": [
15,
17,
14,
19,
16,
18,
21,
20
]
},
{
"features": [
1.9104080400276053,
0.0,
0.0,
0.3085714285714286
],
"choice": [
10,
2,
5,
8,
0,
4,
19,
7,
9,
13,
17,
15,
18,
21,
1,
14,
12,
20,
6,
3,
16
]
},
{
"features": [
-0.050015096618357485,
4.470588235294118,
0.0,
0.3085714285714286
],
"choice": [
8,
10,
4,
7,
5,
11,
18,
6,
20,
19,
9,
14,
16,
21,
0,
3,
15,
17,
1,
2,
13,
12
]
},
{
"features": [
-0.04660973084886128,
-0.8235294117647058,
0.0,
-1.0628571428571427
],
"choice": [
11,
13,
10,
8,
9,
20,
12,
18,
19,
21
]
}
],
"configsource": [
"lgbm/houses",
"lgbm/house_8L",
"lgbm/poker",
"lgbm/default",
"xgboost/Albert",
"xgboost/mv",
"xgboost/bng_echomonths",
"xgboost/house_16H",
"xgb_limitdepth/higgs",
"xgb_limitdepth/bng_pharynx",
"xgb_limitdepth/connect-4",
"xgb_limitdepth/house_16H",
"xgb_limitdepth/bng_echomonths",
"xgb_limitdepth/default",
"rf/houses",
"rf/poker",
"rf/bank-marketing",
"rf/default",
"extra_tree/house_16H",
"extra_tree/default",
"extra_tree/dilbert",
"extra_tree/particulate-matter"
]
}

View File

@ -1,184 +0,0 @@
from functools import wraps
from flaml.automl.task.task import CLASSIFICATION
from .suggest import preprocess_and_suggest_hyperparams
DEFAULT_LOCATION = "default_location"
def flamlize_estimator(super_class, name: str, task: str, alternatives=None):
    """Enhance an estimator class with flaml's data-dependent default hyperparameter settings.

    Example:

    ```python
    import sklearn.ensemble as ensemble
    RandomForestRegressor = flamlize_estimator(
        ensemble.RandomForestRegressor, "rf", "regression"
    )
    ```

    Args:
        super_class: a scikit-learn compatible estimator class.
        name: a str of the estimator's name.
        task: a str of the task type.
        alternatives: (Optional) a list for alternative estimator names. For example,
            ```[("max_depth", 0, "xgboost")]``` means if the "max_depth" is set to 0
            in the constructor, then look for the learned defaults for estimator "xgboost".

    Returns:
        A subclass of `super_class` whose `fit`/`predict` first look up learned,
        data-dependent default hyperparameters for the given task.
    """

    class EstimatorClass(super_class):
        """**Enhanced with flaml's data-dependent default hyperparameter settings.**"""

        @wraps(super_class.__init__)
        def __init__(self, **params):
            # Pop the reserved "default_location" keyword (where learned defaults
            # are loaded from) so the wrapped estimator never sees it.
            if DEFAULT_LOCATION in params:
                self._default_location = params.pop(DEFAULT_LOCATION)
            else:
                self._default_location = None
            # Keep the explicit constructor params; they override learned defaults later.
            self._params = params
            super().__init__(**params)

        # @classmethod
        # @wraps(super_class._get_param_names)
        # def _get_param_names(cls):
        #     return super_class._get_param_names() if hasattr(super_class, "_get_param_names") else []

        def suggest_hyperparams(self, X, y):
            """Suggest hyperparameters.

            Example:

            ```python
            from flaml.default import LGBMRegressor

            estimator = LGBMRegressor()
            hyperparams, estimator_name, X_transformed, y_transformed = estimator.suggest_hyperparams(X_train, y_train)
            print(hyperparams)
            ```

            Args:
                X: A dataframe of training data in shape n*m.
                y: A series of labels in shape n*1.

            Returns:
                hyperparams: A dict of the hyperparameter configurations.
                estimator_name: A str of the underlying estimator name, e.g., 'xgb_limitdepth'.
                X_transformed: the preprocessed X.
                y_transformed: the preprocessed y.
            """
            estimator_name = name
            if alternatives:
                # Each alternative is (param, trigger_value, other_name): if the
                # constructor received param == trigger_value, use the defaults
                # learned for other_name instead.
                for alternative in alternatives:
                    if self._params.get(alternative[0]) == alternative[1]:
                        estimator_name = alternative[2]
                        break
            # If max_depth was not set explicitly, defer to the suggestion logic
            # to choose between the depth-limited and leaf-limited xgboost spaces.
            estimator_name = (
                "choose_xgb"
                if (estimator_name == "xgb_limitdepth" and "max_depth" not in self._params)
                else estimator_name
            )
            (
                hyperparams,
                estimator_class,
                X_transformed,
                y_transformed,
                self._feature_transformer,
                self._label_transformer,
            ) = preprocess_and_suggest_hyperparams(task, X, y, estimator_name, self._default_location)
            assert estimator_class == super_class
            # Explicit constructor arguments take precedence over learned defaults.
            hyperparams.update(self._params)
            return hyperparams, estimator_name, X_transformed, y_transformed

        @wraps(super_class.fit)
        def fit(self, X, y, *args, **params):
            # Resolve data-dependent defaults, then fit with the (possibly
            # transformed) features/labels.
            hyperparams, estimator_name, X, y_transformed = self.suggest_hyperparams(X, y)
            self.set_params(**hyperparams)
            if self._label_transformer and estimator_name in [
                "rf",
                "extra_tree",
                "xgboost",
                "xgb_limitdepth",
                "choose_xgb",
            ]:
                # rf and et have trouble in handling boolean labels; xgboost requires integer labels
                fitted = super().fit(X, y_transformed, *args, **params)
                # if hasattr(self, "_classes"):
                #     self._classes = self._label_transformer.classes_
                # else:
                self.classes_ = self._label_transformer.classes_
                if "xgb" not in estimator_name:
                    # rf and et would do inverse transform automatically; xgb doesn't
                    self._label_transformer = None
            else:
                # lgbm doesn't need label transformation except for non-str/num labels
                try:
                    fitted = super().fit(X, y, *args, **params)
                    self._label_transformer = None
                except ValueError:
                    # Unknown label type: 'unknown'
                    fitted = super().fit(X, y_transformed, *args, **params)
                    self._classes = self._label_transformer.classes_
            return fitted

        @wraps(super_class.predict)
        def predict(self, X, *args, **params):
            # lgbm classification predicts on raw features; every other case goes
            # through the same feature transformation used at fit time.
            if name != "lgbm" or task not in CLASSIFICATION:
                X = self._feature_transformer.transform(X)
            y_pred = super().predict(X, *args, **params)
            # Map encoded labels back to the original label space when a
            # transformer is still attached (xgb paths keep it; rf/et drop it).
            if self._label_transformer and y_pred.ndim == 1:
                y_pred = self._label_transformer.inverse_transform(y_pred)
            return y_pred

        if hasattr(super_class, "predict_proba"):

            @wraps(super_class.predict_proba)
            def predict_proba(self, X, *args, **params):
                # Probabilities are returned in encoded-label column order; no
                # inverse label transform is applied here.
                X_test = self._feature_transformer.transform(X)
                y_pred = super().predict_proba(X_test, *args, **params)
                return y_pred

    EstimatorClass.__doc__ += " " + super_class.__doc__
    EstimatorClass.__name__ = super_class.__name__
    return EstimatorClass
# Optional backends: each flamlized estimator is exported only when its
# underlying library is installed. When the import fails, the exported names
# are bound to an ImportError instance carrying an actionable message, so this
# module itself still imports cleanly.
# NOTE(review): binding an exception *instance* means later use fails with a
# TypeError (not this ImportError) when the name is called — confirm intended.
try:
    import sklearn.ensemble as ensemble
except ImportError:
    RandomForestClassifier = RandomForestRegressor = ExtraTreesClassifier = ExtraTreesRegressor = ImportError(
        "Using flaml.default.* requires scikit-learn."
    )
else:
    RandomForestRegressor = flamlize_estimator(ensemble.RandomForestRegressor, "rf", "regression")
    RandomForestClassifier = flamlize_estimator(ensemble.RandomForestClassifier, "rf", "classification")
    ExtraTreesRegressor = flamlize_estimator(ensemble.ExtraTreesRegressor, "extra_tree", "regression")
    ExtraTreesClassifier = flamlize_estimator(ensemble.ExtraTreesClassifier, "extra_tree", "classification")

try:
    import lightgbm
except ImportError:
    LGBMRegressor = LGBMClassifier = ImportError("Using flaml.default.LGBM* requires lightgbm.")
else:
    LGBMRegressor = flamlize_estimator(lightgbm.LGBMRegressor, "lgbm", "regression")
    LGBMClassifier = flamlize_estimator(lightgbm.LGBMClassifier, "lgbm", "classification")

try:
    import xgboost
except ImportError:
    XGBClassifier = XGBRegressor = ImportError("Using flaml.default.XGB* requires xgboost.")
else:
    # xgboost defaults to the depth-limited search space; constructing with
    # max_depth=0 switches to the leaf-limited "xgboost" defaults instead.
    XGBRegressor = flamlize_estimator(
        xgboost.XGBRegressor,
        "xgb_limitdepth",
        "regression",
        [("max_depth", 0, "xgboost")],
    )
    XGBClassifier = flamlize_estimator(
        xgboost.XGBClassifier,
        "xgb_limitdepth",
        "classification",
        [("max_depth", 0, "xgboost")],
    )
    # if hasattr(xgboost.XGBRegressor, "_get_param_names"):
    #     XGBRegressor._get_param_names = xgboost.XGBRegressor._get_param_names
    #     XGBClassifier._get_param_names = xgboost.XGBClassifier._get_param_names

View File

@ -1,361 +0,0 @@
{
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "extra_tree",
"hyperparameters": {
"n_estimators": 1080,
"max_features": 1.0,
"max_leaves": 590,
"criterion": "entropy"
}
},
{
"class": "extra_tree",
"hyperparameters": {
"n_estimators": 2047,
"max_features": 0.46132798093546956,
"max_leaves": 12856,
"criterion": "gini"
}
},
{
"class": "extra_tree",
"hyperparameters": {
"n_estimators": 408,
"max_features": 0.3629795757973625,
"max_leaves": 81,
"criterion": "entropy"
}
},
{
"class": "extra_tree",
"hyperparameters": {
"n_estimators": 553,
"max_features": 0.9592132391435095,
"max_leaves": 1127,
"criterion": "entropy"
}
},
{
"class": "extra_tree",
"hyperparameters": {}
}
],
"preprocessing": {
"center": [
18000.0,
28.0,
2.0,
0.7565217391304347
],
"scale": [
42124.0,
130.0,
1.0,
0.5714285714285715
]
},
"neighbors": [
{
"features": [
1.196467571930491,
1.0923076923076922,
0.0,
0.4260869565217391
],
"choice": [
1,
2,
4
]
},
{
"features": [
11.096856898680088,
-0.16153846153846155,
0.0,
-0.5739130434782609
],
"choice": [
1,
3,
0,
2,
4
]
},
{
"features": [
8.658152122305575,
0.38461538461538464,
0.0,
-0.7405797101449274
],
"choice": [
1,
3,
0,
4
]
},
{
"features": [
0.27281359794891274,
-0.14615384615384616,
0.0,
-1.3239130434782607
],
"choice": [
3,
0,
4
]
},
{
"features": [
-0.4125676573924604,
-0.1076923076923077,
0.0,
-0.5739130434782609
],
"choice": [
2,
0,
1,
4
]
},
{
"features": [
0.6409647706770487,
1.5538461538461539,
0.0,
0.0
],
"choice": [
2,
0,
3,
1,
4
]
},
{
"features": [
2.3515573069983855,
0.16923076923076924,
0.0,
0.4260869565217391
],
"choice": [
1,
4
]
},
{
"features": [
0.6162045389801538,
-0.1076923076923077,
0.0,
-0.5739130434782609
],
"choice": [
3,
0,
2,
1,
4
]
},
{
"features": [
0.5386240622922799,
-0.09230769230769231,
0.0,
-0.5582880434782608
],
"choice": [
3,
0,
1,
4
]
},
{
"features": [
-0.41133320672300827,
-0.18461538461538463,
0.0,
0.4260869565217391
],
"choice": [
2,
1,
4
]
},
{
"features": [
-0.31155635742094767,
12.36923076923077,
0.0,
0.3865087169129372
],
"choice": [
3,
1,
0,
2,
4
]
},
{
"features": [
-0.40594435476213087,
-0.06153846153846154,
0.0,
-0.7114130434782607
],
"choice": [
2,
1,
0,
3,
4
]
},
{
"features": [
0.0,
32.83076923076923,
0.0,
0.4260869565217391
],
"choice": [
3,
0,
1,
2,
4
]
},
{
"features": [
1.6675766783781218,
0.0,
0.0,
0.4260869565217391
],
"choice": [
1,
3,
0,
4
]
},
{
"features": [
-0.36356946158959264,
0.8923076923076924,
0.0,
-1.2266908212560386
],
"choice": [
3,
4
]
},
{
"features": [
-0.38225239768303104,
-0.05384615384615385,
0.0,
0.4260869565217391
],
"choice": [
1,
0,
3,
2,
4
]
},
{
"features": [
-0.3590352293229513,
0.06153846153846154,
0.0,
-1.3239130434782607
],
"choice": [
0,
2,
3,
1,
4
]
},
{
"features": [
0.3090399772101415,
0.6923076923076923,
0.0,
-0.003997789240972687
],
"choice": [
3,
0,
4
]
},
{
"features": [
-0.3118649700883107,
-0.17692307692307693,
0.0,
0.4260869565217391
],
"choice": [
3,
1,
4
]
},
{
"features": [
0.0,
32.83076923076923,
0.0,
0.4260869565217391
],
"choice": [
4
]
},
{
"features": [
-0.3178473079479632,
-0.06153846153846154,
0.0,
0.4260869565217391
],
"choice": [
1,
0,
3,
4
]
}
],
"configsource": [
"segment",
"Helena",
"kr-vs-kp",
"bank-marketing",
"default"
]
}

View File

@ -1,310 +0,0 @@
{
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "extra_tree",
"hyperparameters": {
"n_estimators": 1074,
"max_features": 0.6008299059364026,
"max_leaves": 9287
}
},
{
"class": "extra_tree",
"hyperparameters": {
"n_estimators": 833,
"max_features": 0.055027081530106846,
"max_leaves": 1361,
"criterion": "gini"
}
},
{
"class": "extra_tree",
"hyperparameters": {
"n_estimators": 2047,
"max_features": 0.9560062760906606,
"max_leaves": 32767,
"criterion": "entropy",
"FLAML_sample_size": 470620
}
},
{
"class": "extra_tree",
"hyperparameters": {
"n_estimators": 812,
"max_features": 1.0,
"max_leaves": 1474,
"criterion": "entropy"
}
},
{
"class": "extra_tree",
"hyperparameters": {
"n_estimators": 2047,
"max_features": 1.0,
"max_leaves": 18344
}
},
{
"class": "extra_tree",
"hyperparameters": {}
}
],
"preprocessing": {
"center": [
24668.5,
54.0,
7.0,
1.0
],
"scale": [
57198.0,
770.5,
6.0,
1.0
]
},
"neighbors": [
{
"features": [
8.710820308402392,
0.0,
0.0,
-0.8148148148148149
],
"choice": [
2,
4,
5
]
},
{
"features": [
0.6701545508584216,
0.9474367293964958,
0.5,
0.0
],
"choice": [
2,
0,
4,
3,
5
]
},
{
"features": [
0.5945575020105598,
-0.03504218040233614,
15.5,
0.0
],
"choice": [
4,
0,
3,
2,
1,
5
]
},
{
"features": [
0.8862285394594217,
0.0,
-0.5,
0.0
],
"choice": [
2,
4,
0,
3,
5
]
},
{
"features": [
-0.2739344033008147,
9.2744970798183,
0.5,
0.0
],
"choice": [
0,
1,
3,
5
]
},
{
"features": [
0.48133676002657433,
-0.058403634003893576,
0.0,
0.0
],
"choice": [
3,
2,
4,
0,
5
]
},
{
"features": [
0.4862145529563971,
0.16353017521090202,
0.5,
0.0
],
"choice": [
2,
4,
0,
3,
5
]
},
{
"features": [
-0.40409629707332423,
-0.06229720960415315,
-0.5,
-1.0
],
"choice": [
4,
2,
0,
5
]
},
{
"features": [
-0.41428896115248787,
1.0408825438027256,
0.3333333333333333,
0.0
],
"choice": [
1,
5
]
},
{
"features": [
0.6317091506696039,
-0.015574302401038288,
-0.6666666666666666,
-1.0
],
"choice": [
0,
2,
3,
5
]
},
{
"features": [
-0.2739344033008147,
2.5256327060350423,
-0.3333333333333333,
0.0
],
"choice": [
3,
2,
4,
0,
1,
5
]
},
{
"features": [
-0.30168012867582783,
0.9682024659312135,
0.0,
0.0
],
"choice": [
1,
5
]
},
{
"features": [
0.2739344033008147,
-0.06229720960415315,
-0.6666666666666666,
0.0
],
"choice": [
3,
0,
1,
5
]
},
{
"features": [
-0.39981293052204625,
0.21025308241401688,
0.5,
0.0
],
"choice": [
4,
2,
3,
0,
5
]
},
{
"features": [
-0.3949351375922235,
-0.04931862426995458,
0.0,
0.0
],
"choice": [
3,
2,
4,
0,
5
]
},
{
"features": [
-0.41797790132522117,
-0.04672290720311486,
-0.5,
0.0
],
"choice": [
4,
3,
2,
0,
5
]
}
],
"configsource": [
"houses",
"fabert",
"Covertype",
"Amazon_employee_access",
"fried",
"default"
]
}

View File

@ -1,312 +0,0 @@
{
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "extra_tree",
"hyperparameters": {
"n_estimators": 1233,
"max_features": 1.0,
"max_leaves": 6452
}
},
{
"class": "extra_tree",
"hyperparameters": {}
},
{
"class": "extra_tree",
"hyperparameters": {
"n_estimators": 346,
"max_features": 1.0,
"max_leaves": 1007,
"criterion": "entropy"
}
},
{
"class": "extra_tree",
"hyperparameters": {
"n_estimators": 2047,
"max_features": 0.5106397565689275,
"max_leaves": 32767,
"FLAML_sample_size": 319382
}
}
],
"preprocessing": {
"center": [
36691.0,
10.0,
0.0,
1.0
],
"scale": [
474977.25,
7.5,
1.0,
0.5
]
},
"neighbors": [
{
"features": [
0.0,
0.0,
0.0,
0.0
],
"choice": [
2,
0,
3,
1
]
},
{
"features": [
0.6806831274550518,
12.0,
0.0,
0.0
],
"choice": [
1
]
},
{
"features": [
-0.05119403087200492,
4.0,
0.0,
0.0
],
"choice": [
0,
1
]
},
{
"features": [
1.817579684079606,
-0.13333333333333333,
0.0,
-0.6666666666666667
],
"choice": [
0,
3,
2,
1
]
},
{
"features": [
0.14376478031316237,
-0.13333333333333333,
0.0,
-1.7777777777777777
],
"choice": [
2,
0,
3,
1
]
},
{
"features": [
-0.044096848849076456,
-0.13333333333333333,
0.0,
-0.6666666666666667
],
"choice": [
2,
3,
0,
1
]
},
{
"features": [
1.817579684079606,
10.666666666666666,
0.0,
0.0
],
"choice": [
1
]
},
{
"features": [
-0.01831245601763032,
-0.13333333333333333,
0.0,
-1.5555555555555556
],
"choice": [
2,
0,
3,
1
]
},
{
"features": [
1.817579684079606,
1.0666666666666667,
0.0,
-0.8888888888888888
],
"choice": [
1
]
},
{
"features": [
1.817579684079606,
0.0,
0.0,
-1.8
],
"choice": [
2,
0,
3,
1
]
},
{
"features": [
0.2584144819567674,
0.0,
0.0,
0.0
],
"choice": [
2,
0,
3,
1
]
},
{
"features": [
0.0,
0.0,
0.0,
0.0
],
"choice": [
1
]
},
{
"features": [
-0.034077421602824134,
0.8,
0.0,
0.0
],
"choice": [
1
]
},
{
"features": [
-0.034077421602824134,
-0.26666666666666666,
0.0,
0.0
],
"choice": [
0,
3,
1
]
},
{
"features": [
-0.038138668746766295,
-0.26666666666666666,
0.0,
0.0
],
"choice": [
3,
0,
1
]
},
{
"features": [
0.0,
0.0,
0.0,
-0.6000000000000001
],
"choice": [
0,
1
]
},
{
"features": [
0.6698805048031248,
-0.13333333333333333,
0.0,
-1.3333333333333335
],
"choice": [
3,
1
]
},
{
"features": [
1.8649693222149062,
0.0,
0.0,
0.0
],
"choice": [
1
]
},
{
"features": [
-0.0488254963790371,
5.066666666666666,
0.0,
0.0
],
"choice": [
0,
2,
1
]
},
{
"features": [
-0.04550112663290715,
-0.9333333333333333,
0.0,
-1.3333333333333335
],
"choice": [
2,
0,
1
]
}
],
"configsource": [
"house_16H",
"default",
"dilbert",
"particulate-matter"
]
}

View File

@ -1,90 +0,0 @@
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import pairwise_distances
def _augment(row):
max, avg, id = row.max(), row.mean(), row.index[0]
return row.apply(lambda x: (x, max, avg, id))
def construct_portfolio(regret_matrix, meta_features, regret_bound):
    """The portfolio construction algorithm.
    [Reference](https://arxiv.org/abs/2202.09927).
    Args:
        regret_matrix: A dataframe of regret matrix (configs as rows, tasks as columns).
        meta_features: None or a dataframe of metafeatures matrix.
            When set to None, the algorithm uses greedy strategy.
            Otherwise, the algorithm uses greedy strategy with feedback
            from the nearest neighbor predictor.
        regret_bound: A float of the regret bound.
    Returns:
        A list of configuration names.
    """
    configs = []
    all_configs = set(regret_matrix.index.tolist())
    tasks = regret_matrix.columns
    # pre-processing
    if meta_features is not None:
        # Normalize meta features so the l2 nearest-neighbor distances are meaningful.
        scaler = RobustScaler()
        meta_features = meta_features.loc[tasks]
        meta_features.loc[:, :] = scaler.fit_transform(meta_features)
        # For each task, find the closest OTHER task in (normalized) meta-feature space.
        nearest_task = {}
        for t in tasks:
            other_meta_features = meta_features.drop(t)
            dist = pd.DataFrame(
                pairwise_distances(
                    meta_features.loc[t].to_numpy().reshape(1, -1),
                    other_meta_features,
                    metric="l2",
                ),
                columns=other_meta_features.index,
            )
            nearest_task[t] = dist.idxmin(axis=1)
        # Augment each cell into a tuple so that min() breaks ties deterministically.
        regret_matrix = regret_matrix.apply(_augment, axis=1)
        print(regret_matrix)  # NOTE(review): debug print of the full matrix — consider logging.

    def loss(configs):
        """Loss of config set `configs`, according to nearest neighbor config predictor."""
        if meta_features is not None:
            # Feedback mode: evaluate each task using the best config of its nearest neighbor.
            r = []
            best_config_per_task = regret_matrix.loc[configs, :].min()
            for t in tasks:
                config = best_config_per_task[nearest_task[t]].iloc[0][-1]
                r.append(regret_matrix[t][config][0])
        else:
            # Greedy mode: each task is served by the best config currently in the set.
            r = regret_matrix.loc[configs].min()
        # Penalize only the regret exceeding the bound; report average regret for tie-breaks.
        excessive_regret = (np.array(r) - regret_bound).clip(min=0).sum()
        avg_regret = np.array(r).mean()
        return excessive_regret, avg_regret

    prev = np.inf
    i = 0
    eps = 1e-5
    while True:
        # Try adding each not-yet-selected config and score the resulting set.
        # NOTE(review): assumes at least two candidates remain (sorted_losses[1] below)
        # — confirm the config pool is always larger than the portfolio.
        candidates = [configs + [d] for d in all_configs.difference(configs)]
        losses, avg_regret = tuple(zip(*(loss(x) for x in candidates)))
        sorted_losses = np.sort(losses)
        if sorted_losses[1] - sorted_losses[0] < eps:
            # Near-tie on excessive regret: fall back to average regret as the metric.
            minloss = np.nanmin(losses)
            print(f"tie detected at loss = {sorted_losses[0]}, using alternative metric.")
            tied = np.flatnonzero(losses - minloss < eps)
            losses = [(avg_regret[i], i) for i in tied]
            minloss, ind = min(losses)
            if minloss > prev - eps:
                # No meaningful improvement over the previous round — stop growing.
                print(f"May be overfitting at k = {i + 1}, current = {minloss:.5f}, " f"prev = {prev:.5f}. Stopping.")
                break
            configs = candidates[ind]
            prev = minloss
        else:
            # Clear winner on excessive regret.
            configs = candidates[np.nanargmin(losses)]
        i += 1
        if sorted_losses[0] <= eps:
            # All tasks are within the regret bound — portfolio is complete.
            print(f"Reached target regret bound of {regret_bound}! k = {i}. Declining to pick further!")
            break
    return configs

View File

@ -1,370 +0,0 @@
{
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "lgbm",
"hyperparameters": {
"n_estimators": 2541,
"num_leaves": 1667,
"min_child_samples": 29,
"learning_rate": 0.0016660662914022302,
"log_max_bin": 8,
"colsample_bytree": 0.5157078343718623,
"reg_alpha": 0.045792841240713165,
"reg_lambda": 0.0012362651138125363,
"FLAML_sample_size": 436899
}
},
{
"class": "lgbm",
"hyperparameters": {
"n_estimators": 141,
"num_leaves": 139,
"min_child_samples": 8,
"learning_rate": 0.04824748268727149,
"log_max_bin": 9,
"colsample_bytree": 0.5261441571042451,
"reg_alpha": 0.002896920833899335,
"reg_lambda": 0.024463247502165594
}
},
{
"class": "lgbm",
"hyperparameters": {
"n_estimators": 31204,
"num_leaves": 4,
"min_child_samples": 3,
"learning_rate": 0.009033979476164342,
"log_max_bin": 10,
"colsample_bytree": 0.5393339924944204,
"reg_alpha": 15.800090067239827,
"reg_lambda": 34.82471227276953
}
},
{
"class": "lgbm",
"hyperparameters": {
"n_estimators": 362,
"num_leaves": 1208,
"min_child_samples": 8,
"learning_rate": 0.02070742242160566,
"log_max_bin": 4,
"colsample_bytree": 0.37915528071680865,
"reg_alpha": 0.002982599447751338,
"reg_lambda": 1.136605174453919,
"FLAML_sample_size": 337147
}
},
{
"class": "lgbm",
"hyperparameters": {}
}
],
"preprocessing": {
"center": [
18000.0,
28.0,
2.0,
0.7565217391304347
],
"scale": [
42124.0,
130.0,
1.0,
0.5714285714285715
]
},
"neighbors": [
{
"features": [
1.196467571930491,
1.0923076923076922,
0.0,
0.4260869565217391
],
"choice": [
4
]
},
{
"features": [
11.096856898680088,
-0.16153846153846155,
0.0,
-0.5739130434782609
],
"choice": [
0,
1,
3,
2,
4
]
},
{
"features": [
8.658152122305575,
0.38461538461538464,
0.0,
-0.7405797101449274
],
"choice": [
2,
1,
0,
3,
4
]
},
{
"features": [
0.27281359794891274,
-0.14615384615384616,
0.0,
-1.3239130434782607
],
"choice": [
0,
1,
4
]
},
{
"features": [
-0.4125676573924604,
-0.1076923076923077,
0.0,
-0.5739130434782609
],
"choice": [
3,
1,
0,
2,
4
]
},
{
"features": [
0.6409647706770487,
1.5538461538461539,
0.0,
0.0
],
"choice": [
2,
0,
1,
4
]
},
{
"features": [
2.3515573069983855,
0.16923076923076924,
0.0,
0.4260869565217391
],
"choice": [
2,
0,
3,
1,
4
]
},
{
"features": [
0.6162045389801538,
-0.1076923076923077,
0.0,
-0.5739130434782609
],
"choice": [
1,
4
]
},
{
"features": [
0.5386240622922799,
-0.09230769230769231,
0.0,
-0.5582880434782608
],
"choice": [
1,
0,
4
]
},
{
"features": [
-0.41133320672300827,
-0.18461538461538463,
0.0,
0.4260869565217391
],
"choice": [
3,
2,
4
]
},
{
"features": [
-0.31155635742094767,
12.36923076923077,
0.0,
0.3865087169129372
],
"choice": [
2,
3,
0,
1,
4
]
},
{
"features": [
-0.40594435476213087,
-0.06153846153846154,
0.0,
-0.7114130434782607
],
"choice": [
1,
0,
4
]
},
{
"features": [
0.0,
32.83076923076923,
0.0,
0.4260869565217391
],
"choice": [
0,
1,
3,
2,
4
]
},
{
"features": [
1.6675766783781218,
0.0,
0.0,
0.4260869565217391
],
"choice": [
0,
1,
4
]
},
{
"features": [
-0.36356946158959264,
0.8923076923076924,
0.0,
-1.2266908212560386
],
"choice": [
3,
4
]
},
{
"features": [
-0.38225239768303104,
-0.05384615384615385,
0.0,
0.4260869565217391
],
"choice": [
3,
1,
0,
4
]
},
{
"features": [
-0.3590352293229513,
0.06153846153846154,
0.0,
-1.3239130434782607
],
"choice": [
4
]
},
{
"features": [
0.3090399772101415,
0.6923076923076923,
0.0,
-0.003997789240972687
],
"choice": [
1,
3,
4
]
},
{
"features": [
-0.3118649700883107,
-0.17692307692307693,
0.0,
0.4260869565217391
],
"choice": [
1,
4
]
},
{
"features": [
0.0,
32.83076923076923,
0.0,
0.4260869565217391
],
"choice": [
0,
1,
3,
4
]
},
{
"features": [
-0.3178473079479632,
-0.06153846153846154,
0.0,
0.4260869565217391
],
"choice": [
1,
0,
4
]
}
],
"configsource": [
"Airlines",
"riccardo",
"fried",
"Dionis",
"default"
]
}

View File

@ -1,416 +0,0 @@
{
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "lgbm",
"hyperparameters": {
"n_estimators": 134,
"num_leaves": 225,
"min_child_samples": 21,
"learning_rate": 0.10182098014295998,
"log_max_bin": 5,
"colsample_bytree": 0.6103565306428956,
"reg_alpha": 0.0009765625,
"reg_lambda": 40.413729576022625
}
},
{
"class": "lgbm",
"hyperparameters": {
"n_estimators": 3726,
"num_leaves": 155,
"min_child_samples": 4,
"learning_rate": 0.040941607728296484,
"log_max_bin": 5,
"colsample_bytree": 0.5326256194627191,
"reg_alpha": 0.7408711930398492,
"reg_lambda": 0.5467731065349226
}
},
{
"class": "lgbm",
"hyperparameters": {
"n_estimators": 573,
"num_leaves": 16,
"min_child_samples": 52,
"learning_rate": 0.2422782244991656,
"log_max_bin": 7,
"colsample_bytree": 1.0,
"reg_alpha": 0.03433194930183514,
"reg_lambda": 0.03870494540146326
}
},
{
"class": "lgbm",
"hyperparameters": {
"n_estimators": 2931,
"num_leaves": 106,
"min_child_samples": 49,
"learning_rate": 0.007146230961642236,
"log_max_bin": 7,
"colsample_bytree": 0.46947896116006055,
"reg_alpha": 0.37428758811879526,
"reg_lambda": 23.639977131692564
}
},
{
"class": "lgbm",
"hyperparameters": {
"n_estimators": 241,
"num_leaves": 58,
"min_child_samples": 2,
"learning_rate": 0.022730855281657265,
"log_max_bin": 5,
"colsample_bytree": 0.5620897082415793,
"reg_alpha": 0.0031614554887399314,
"reg_lambda": 0.02175056245188971
}
},
{
"class": "lgbm",
"hyperparameters": {
"n_estimators": 8353,
"num_leaves": 371,
"min_child_samples": 71,
"learning_rate": 0.017965875630873252,
"log_max_bin": 10,
"colsample_bytree": 0.9002082433803926,
"reg_alpha": 0.4864366003694002,
"reg_lambda": 0.024138585745106363,
"FLAML_sample_size": 470619
}
},
{
"class": "lgbm",
"hyperparameters": {
"n_estimators": 320,
"num_leaves": 24,
"min_child_samples": 53,
"learning_rate": 0.019316895546068795,
"log_max_bin": 6,
"colsample_bytree": 0.3955693254372702,
"reg_alpha": 0.0013785083170001627,
"reg_lambda": 0.04644365636517757
}
},
{
"class": "lgbm",
"hyperparameters": {
"n_estimators": 733,
"num_leaves": 11,
"min_child_samples": 94,
"learning_rate": 0.06276798296942972,
"log_max_bin": 6,
"colsample_bytree": 0.6341928918435795,
"reg_alpha": 0.5811038918218691,
"reg_lambda": 43.304997517523944
}
},
{
"class": "lgbm",
"hyperparameters": {}
}
],
"preprocessing": {
"center": [
40337.0,
54.0,
7.0,
1.0
],
"scale": [
58722.0,
766.0,
6.0,
1.0
]
},
"neighbors": [
{
"features": [
8.217925138789552,
0.0,
0.0,
-0.8148148148148149
],
"choice": [
5,
1,
0,
3,
2,
7,
4,
8
]
},
{
"features": [
5.691767991553421,
0.007832898172323759,
58.0,
0.0
],
"choice": [
0,
2,
4,
7,
6,
8
]
},
{
"features": [
0.385937127482034,
0.9530026109660574,
0.5,
0.0
],
"choice": [
3,
7,
0,
4,
1,
8
]
},
{
"features": [
0.3123020333094922,
-0.03524804177545692,
15.5,
0.0
],
"choice": [
3,
0,
7,
6,
1,
4,
5,
2,
8
]
},
{
"features": [
0.5964033922550321,
0.0,
-0.5,
0.0
],
"choice": [
3,
0,
7,
4,
8
]
},
{
"features": [
-0.5336500800381458,
9.328981723237598,
0.5,
0.0
],
"choice": [
3,
0,
4,
1,
2,
7,
6,
8
]
},
{
"features": [
0.20201968597799802,
-0.0587467362924282,
0.0,
0.0
],
"choice": [
4,
6,
1,
7,
5,
3,
0,
2,
8
]
},
{
"features": [
0.20677088655018563,
0.16449086161879894,
0.5,
0.0
],
"choice": [
3,
0,
1,
5,
7,
4,
8
]
},
{
"features": [
-0.6604339089268076,
-0.06266318537859007,
-0.5,
-1.0
],
"choice": [
8
]
},
{
"features": [
-0.6703620448894793,
1.0469973890339426,
0.3333333333333333,
0.0
],
"choice": [
4,
1,
8
]
},
{
"features": [
0.34848949286468445,
-0.015665796344647518,
-0.6666666666666666,
-1.0
],
"choice": [
1,
5,
2,
3,
0,
8
]
},
{
"features": [
-0.5336500800381458,
2.5404699738903394,
-0.3333333333333333,
0.0
],
"choice": [
2,
8
]
},
{
"features": [
-0.5606757263036,
0.9738903394255874,
0.0,
0.0
],
"choice": [
4,
1,
8
]
},
{
"features": [
0.0,
-0.06266318537859007,
-0.6666666666666666,
0.0
],
"choice": [
2,
1,
5,
8
]
},
{
"features": [
-0.6562617077075031,
0.21148825065274152,
0.5,
0.0
],
"choice": [
2,
6,
7,
5,
3,
1,
4,
8
]
},
{
"features": [
-0.6515105071353156,
-0.04960835509138381,
0.0,
0.0
],
"choice": [
6,
1,
3,
7,
5,
4,
0,
2,
8
]
},
{
"features": [
-0.6739552467559007,
-0.04699738903394256,
-0.5,
0.0
],
"choice": [
6,
7,
3,
1,
0,
4,
5,
8
]
}
],
"configsource": [
"Helena",
"connect-4",
"jungle_chess_2pcs_raw_endgame_complete",
"Jannis",
"fabert",
"Covertype",
"segment",
"APSFailure",
"default"
]
}

View File

@ -1,281 +0,0 @@
{
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "lgbm",
"hyperparameters": {
"n_estimators": 4797,
"num_leaves": 122,
"min_child_samples": 2,
"learning_rate": 0.022635758411078528,
"log_max_bin": 9,
"colsample_bytree": 0.7019911744574896,
"reg_alpha": 0.004252223402511765,
"reg_lambda": 0.11288241427227624
}
},
{
"class": "lgbm",
"hyperparameters": {
"n_estimators": 1009,
"num_leaves": 42,
"min_child_samples": 12,
"learning_rate": 0.02167229637171611,
"log_max_bin": 7,
"colsample_bytree": 0.7385038460573171,
"reg_alpha": 0.003607184551842614,
"reg_lambda": 12.08340803550741
}
},
{
"class": "lgbm",
"hyperparameters": {
"n_estimators": 32767,
"num_leaves": 372,
"min_child_samples": 4,
"learning_rate": 0.03517259015200922,
"log_max_bin": 5,
"colsample_bytree": 1.0,
"reg_alpha": 0.02271142170225636,
"reg_lambda": 0.001963791798843179,
"FLAML_sample_size": 830258
}
},
{
"class": "lgbm",
"hyperparameters": {}
}
],
"preprocessing": {
"center": [
36691.0,
10.0,
0.0,
1.0
],
"scale": [
140856.0,
3.0,
1.0,
0.33333333333333337
]
},
"neighbors": [
{
"features": [
0.0,
0.0,
0.0,
0.0
],
"choice": [
3
]
},
{
"features": [
-0.17263020389617767,
10.0,
0.0,
0.0
],
"choice": [
1,
0,
3
]
},
{
"features": [
6.129018288180837,
-0.3333333333333333,
0.0,
-1.0
],
"choice": [
1,
3
]
},
{
"features": [
0.48478588061566424,
-0.3333333333333333,
0.0,
-2.666666666666666
],
"choice": [
1,
3
]
},
{
"features": [
-0.14869796103822344,
-0.3333333333333333,
0.0,
-1.0
],
"choice": [
1,
3
]
},
{
"features": [
-0.06175100812176975,
-0.3333333333333333,
0.0,
-2.333333333333333
],
"choice": [
3
]
},
{
"features": [
6.129018288180837,
2.6666666666666665,
0.0,
-1.333333333333333
],
"choice": [
0,
1,
2,
3
]
},
{
"features": [
6.129018288180837,
0.0,
0.0,
-2.6999999999999997
],
"choice": [
1,
3
]
},
{
"features": [
0.8713934798659624,
0.0,
0.0,
0.0
],
"choice": [
1,
3
]
},
{
"features": [
-0.19217498722099166,
0.6666666666666666,
0.0,
-1.0
],
"choice": [
0,
3
]
},
{
"features": [
0.0,
0.0,
0.0,
0.0
],
"choice": [
1,
0,
3
]
},
{
"features": [
-0.11491168285341058,
2.0,
0.0,
0.0
],
"choice": [
1,
3
]
},
{
"features": [
-0.11491168285341058,
-0.6666666666666666,
0.0,
0.0
],
"choice": [
3
]
},
{
"features": [
-0.1286065201340376,
-0.6666666666666666,
0.0,
0.0
],
"choice": [
0,
1,
3
]
},
{
"features": [
0.0,
0.0,
0.0,
-0.9
],
"choice": [
3
]
},
{
"features": [
6.288819787584483,
0.0,
0.0,
0.0
],
"choice": [
2,
0,
1,
3
]
},
{
"features": [
-0.16464332367808257,
12.666666666666666,
0.0,
0.0
],
"choice": [
0,
3
]
}
],
"configsource": [
"houses",
"house_8L",
"poker",
"default"
]
}

View File

@ -1,222 +0,0 @@
import pandas as pd
import numpy as np
import argparse
from pathlib import Path
import json
from sklearn.preprocessing import RobustScaler
from flaml.default import greedy
from flaml.default.regret import load_result, build_regret
from flaml.version import __version__
# Target per-task regret for portfolio construction: passed to
# greedy.construct_portfolio, which stops adding configs once every
# task's regret is within this bound.
regret_bound = 0.01
def config_predictor_tuple(tasks, configs, meta_features, regret_matrix):
    """Config predictor represented in tuple.
    The returned tuple consists of (meta_features, preferences, proc).
    Args:
        tasks: Task names (columns of the regret matrix) to include.
        configs: Names of the selected configs (rows of the regret matrix).
        meta_features: A dataframe of metafeatures, indexed by task name.
        regret_matrix: A dataframe of the configuration(row)-task(column) regret matrix.
    Returns:
        meta_features_norm: A dataframe of normalized meta features, each column for a task.
        preferences: A dataframe of sorted configuration indices by their performance per task (column).
        proc: A dict with the scaler's "center" and "scale" lists, needed to
            normalize unseen tasks' meta features at prediction time.
    """
    # pre-processing
    scaler = RobustScaler()
    meta_features_norm = meta_features.loc[tasks]  # this makes a copy
    meta_features_norm.loc[:, :] = scaler.fit_transform(meta_features_norm)
    proc = {
        "center": scaler.center_.tolist(),
        "scale": scaler.scale_.tolist(),
    }
    # best model for each dataset in training
    # choices = regret_matrix[tasks].loc[configs].reset_index(drop=True).idxmin()
    # break ties using the order in configs
    # Each cell becomes (regret, positional config index) so argsort below ranks
    # primarily by regret and secondarily by the config's position in `configs`.
    regret = (
        regret_matrix[tasks]
        .loc[configs]
        .reset_index(drop=True)
        .apply(lambda row: row.apply(lambda x: (x, row.name)), axis=1)
    )
    print(regret)  # NOTE(review): debug print — consider removing or using a logger.
    preferences = pd.DataFrame(np.argsort(regret, axis=0), columns=regret.columns)
    print(preferences)  # NOTE(review): debug print — consider removing or using a logger.
    return (meta_features_norm, preferences, proc)
def build_portfolio(meta_features, regret, strategy):
    """Build a portfolio from meta features and regret matrix.

    Args:
        meta_features: A dataframe of metafeatures matrix.
        regret: A dataframe of regret matrix.
        strategy: A str of the strategy, one of ("greedy", "greedy-feedback").

    Returns:
        A list of config names; includes "default" whenever it exists in `regret`.
    """
    assert strategy in ("greedy", "greedy-feedback")
    # "greedy" ignores meta features; "greedy-feedback" steers the greedy search
    # with nearest-neighbor feedback computed from them.
    feedback = meta_features if strategy == "greedy-feedback" else None
    portfolio = greedy.construct_portfolio(regret, feedback, regret_bound)
    # Always keep the default config as a fallback when it is available.
    if "default" in regret.index and "default" not in portfolio:
        portfolio += ["default"]
    return portfolio
def load_json(filename):
    """Parse the JSON file at `filename` and return its deserialized contents."""
    with open(filename, "r") as handle:
        contents = json.load(handle)
    return contents
def _filter(preference, regret):
    """Remove choices after default or have NaN regret.

    Args:
        preference: A series of positional config indices, best first
            (presumably one column of the `preferences` frame — verify against caller).
        regret: A series of regrets indexed by config name; may contain "default".

    Returns:
        A list of positional config indices, truncated just after the "default"
        entry (if present) and with NaN-regret choices dropped.
    """
    try:
        # Positional index of "default"; any choice ranked after it is useless
        # because "default" would already have been tried.
        last = regret.index.get_loc("default")  # len(preference) - 1
        preference = preference[: preference[preference == last].index[0] + 1]
    except KeyError:  # no "default"
        pass
    finally:
        # Switch regret to positional indexing, then keep only choices whose
        # regret is not NaN (i.e., configs actually evaluated on this task).
        regret = regret.reset_index(drop=True)
        preference = preference[regret[preference].notna().to_numpy()]
    # regret = regret[preference].reset_index(drop=True)
    # dup = regret[regret.duplicated()]
    # if not dup.empty:
    #     # break ties using the order in configs
    #     unique = dup.drop_duplicates()
    #     for u in unique:
    #         subset = regret == u
    #         preference[subset].sort_values(inplace=True)
    #     # raise ValueError(preference)
    return preference.tolist()
def serialize(configs, regret, meta_features, output_file, config_path):
    """Store to disk all information FLAML-metalearn needs at runtime.

    Args:
        configs: Names of model configs (also the JSON file basenames under `config_path`).
        regret: Regret matrix (configs x tasks).
        meta_features: Task metafeatures, indexed by task name.
        output_file: Filename to write the predictor JSON to.
        config_path: Path containing config json files.

    Returns:
        The meta predictor dict that was written to `output_file`.
    """
    output_file = Path(output_file)
    # delete if exists
    try:
        output_file.unlink()
    except FileNotFoundError:
        pass
    # Normalized features + per-task config rankings + scaler parameters.
    meta_features_norm, preferences, proc = config_predictor_tuple(regret.columns, configs, meta_features, regret)
    # Load each selected config's hyperparameter JSON from disk.
    portfolio = [load_json(config_path.joinpath(m + ".json")) for m in configs]
    regret = regret.loc[configs]
    meta_predictor = {
        "version": __version__,
        "meta_feature_names": list(meta_features.columns),
        "portfolio": portfolio,
        "preprocessing": proc,
        # One entry per training task: its normalized features plus the ranked
        # config choices (truncated/cleaned by _filter).
        "neighbors": [
            {"features": x.tolist(), "choice": _filter(preferences[y], regret[y])}
            for x, y in zip(meta_features_norm.to_records(index=False), preferences.columns)
        ],
        "configsource": list(configs),
    }
    with open(output_file, "w+") as f:
        json.dump(meta_predictor, f, indent=4)
    return meta_predictor
# def analyze(regret_matrix, meta_predictor):
# tasks = regret_matrix.columns
# neighbors = meta_predictor["neighbors"]
# from sklearn.neighbors import NearestNeighbors
# nn = NearestNeighbors(n_neighbors=1)
# for i, task in enumerate(neighbors):
# other_tasks = [j for j in range(len(neighbors)) if j != i]
# # find the nn and the regret
# nn.fit([neighbors[j]["features"] for j in other_tasks])
# dist, ind = nn.kneighbors(
# np.array(task["features"]).reshape(1, -1), return_distance=True
# )
# ind = other_tasks[int(ind.item())]
# choice = int(neighbors[ind]["choice"][0])
# r = regret_matrix.iloc[choice, i]
# if r > regret_bound:
# label = "outlier"
# else:
# label = "normal"
# print(tasks[i], label, tasks[ind], "dist", dist, "regret", r)
# # find the best model and the regret
# regrets = regret_matrix.iloc[other_tasks, i]
# best = regrets.min()
# if best > regret_bound:
# print(tasks[i], "best_regret", best, "task", regrets.idxmin())
def main():
    """CLI entry point: build and serialize portfolio JSONs for one or more estimators.

    For each estimator, loads its benchmark results, derives the regret matrix,
    constructs a portfolio, and writes <output>/<estimator>/<task>.json. When
    several estimators are given, additionally writes a merged portfolio to
    <output>/all/<task>.json using the best baseline across estimators.
    """
    parser = argparse.ArgumentParser(description="Build a portfolio.")
    parser.add_argument("--strategy", help="One of {greedy, greedy-feedback}", default="greedy")
    parser.add_argument("--input", help="Input path")
    parser.add_argument("--metafeatures", help="CSV of task metafeatures")
    parser.add_argument("--exclude", help="One task name to exclude (for LOO purposes)")
    parser.add_argument("--output", help="Location to write portfolio JSON")
    parser.add_argument("--task", help="Task to merge portfolios", default="binary")
    parser.add_argument(
        "--estimator",
        help="Estimators to merge portfolios",
        default=["lgbm", "xgboost"],
        nargs="+",
    )
    args = parser.parse_args()
    # groupby(level=0).first() de-duplicates tasks, keeping the first row of each.
    meta_features = pd.read_csv(args.metafeatures, index_col=0).groupby(level=0).first()
    if args.exclude:
        meta_features.drop(args.exclude, inplace=True)
    baseline_best = None
    all_results = None
    for estimator in args.estimator:
        # produce regret
        # NOTE(review): `all` shadows the builtin; it holds the config-by-task result matrix.
        all, baseline = load_result(f"{args.input}/{estimator}/results.csv", args.task, "result")
        regret = build_regret(all, baseline)
        regret = regret.replace(np.inf, np.nan).dropna(axis=1, how="all")
        if args.exclude:
            # Leave-one-out: drop every config/task whose name contains the excluded task.
            regret = regret.loc[[i for i in regret.index if args.exclude not in i]]
            regret = regret[[c for c in regret.columns if args.exclude not in c]]
        print(f"Regret matrix complete: {100 * regret.count().sum() / regret.shape[0] / regret.shape[1]}%")
        print(f"Num models considered: {regret.shape[0]}")
        configs = build_portfolio(meta_features, regret, args.strategy)
        meta_predictor = serialize(
            configs,
            regret,
            meta_features,
            f"{args.output}/{estimator}/{args.task}.json",
            Path(f"{args.input}/{estimator}"),
        )
        configsource = meta_predictor["configsource"]
        all = all.loc[configsource]
        # Prefix config names with the estimator so merged row indices stay unique.
        all.rename({x: f"{estimator}/{x}" for x in regret.index.values}, inplace=True)
        # Keep the element-wise best baseline seen so far across estimators.
        baseline_best = baseline if baseline_best is None else pd.DataFrame({0: baseline_best, 1: baseline}).max(1)
        all_results = all if all_results is None else pd.concat([all_results, all])
        # analyze(regret, meta_predictor)
    # Merged regret across all estimators, against the best per-task baseline.
    regrets = build_regret(all_results, baseline_best)
    if len(args.estimator) > 1:
        meta_predictor = serialize(
            regrets.index,
            regrets,
            meta_features,
            f"{args.output}/all/{args.task}.json",
            Path(args.input),
        )
if __name__ == "__main__":
# execute only if run as a script
main()

View File

@ -1,42 +0,0 @@
import argparse
from os import path
import pandas as pd
def build_regret(all, baseline):
    """Compute the regret matrix: per-task baseline score minus each config's score.

    Only tasks (columns of `all`) that also appear in `baseline`'s index are kept;
    the subtraction aligns the baseline Series against the remaining columns.
    """
    shared_tasks = all.columns.intersection(baseline.index)
    trimmed = all[shared_tasks]
    return baseline - trimmed
def write_regret(regret, filename):
    """Write the regret matrix to `filename` as CSV (index included)."""
    regret.to_csv(filename)
def load_result(filename, task_type, metric):
    """Load benchmark results and extract per-config scores and the baseline.

    Args:
        filename: CSV with at least the columns task, fold, params, type, and `metric`.
        task_type: Only rows whose `type` column equals this value are kept.
        metric: Name of the score column to pivot on.

    Returns:
        A tuple (df, baseline): df is a params-by-task pivot table of the mean
        metric; baseline is the mean metric per task taken from rows whose
        config name equals the task name (presumably the task's own tuned run —
        verify against the results format).
    """
    df = pd.read_csv(filename)
    # Keep only rows of the requested task type that actually have a score.
    df = df.loc[
        (df[metric].notnull()) & (df.type == task_type),
        ["task", "fold", "params", metric],
    ]
    # Reduce the params blob to the config name (basename of its _modeljson, no extension).
    # SECURITY NOTE(review): eval() executes arbitrary code from the CSV; if the
    # results file can be untrusted, ast.literal_eval would be safer — confirm.
    df["params"] = df["params"].apply(lambda x: path.splitext(path.basename(eval(x)["_modeljson"]))[0])
    baseline = df.loc[df["task"] == df["params"], ["task", metric]].groupby("task").mean()[metric]
    df = df.pivot_table(index="params", columns="task", values=metric)
    return df, baseline
def main():
    """CLI entry point: read experiment results, derive the regret matrix, write it as CSV."""
    parser = argparse.ArgumentParser(description="Build a regret matrix.")
    parser.add_argument("--result_csv", help="File of experiment results")
    parser.add_argument("--task_type", help="Type of task")
    parser.add_argument("--metric", default="result", help="Metric for calculating regret")
    parser.add_argument("--output", help="Location to write regret CSV to")
    args = parser.parse_args()

    scores, baseline = load_result(args.result_csv, args.task_type, args.metric)
    regret = build_regret(scores, baseline)
    write_regret(regret, args.output)
if __name__ == "__main__":
# execute only if run as a script
main()

View File

@ -1,333 +0,0 @@
{
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "rf",
"hyperparameters": {
"n_estimators": 501,
"max_features": 0.24484242524861066,
"max_leaves": 1156,
"criterion": "entropy"
}
},
{
"class": "rf",
"hyperparameters": {
"n_estimators": 356,
"max_features": 0.1,
"max_leaves": 102,
"criterion": "gini"
}
},
{
"class": "rf",
"hyperparameters": {
"n_estimators": 1000,
"max_features": 0.1779692423238241,
"max_leaves": 7499,
"criterion": "gini"
}
},
{
"class": "rf",
"hyperparameters": {}
}
],
"preprocessing": {
"center": [
18000.0,
28.0,
2.0,
0.7565217391304347
],
"scale": [
42124.0,
130.0,
1.0,
0.5714285714285715
]
},
"neighbors": [
{
"features": [
1.196467571930491,
1.0923076923076922,
0.0,
0.4260869565217391
],
"choice": [
0,
3
]
},
{
"features": [
11.096856898680088,
-0.16153846153846155,
0.0,
-0.5739130434782609
],
"choice": [
2,
0,
1,
3
]
},
{
"features": [
8.658152122305575,
0.38461538461538464,
0.0,
-0.7405797101449274
],
"choice": [
2,
0,
3
]
},
{
"features": [
0.27281359794891274,
-0.14615384615384616,
0.0,
-1.3239130434782607
],
"choice": [
2,
0,
3
]
},
{
"features": [
-0.4125676573924604,
-0.1076923076923077,
0.0,
-0.5739130434782609
],
"choice": [
2,
1,
0,
3
]
},
{
"features": [
0.6409647706770487,
1.5538461538461539,
0.0,
0.0
],
"choice": [
1,
0,
2,
3
]
},
{
"features": [
2.3515573069983855,
0.16923076923076924,
0.0,
0.4260869565217391
],
"choice": [
2,
0,
3
]
},
{
"features": [
0.6162045389801538,
-0.1076923076923077,
0.0,
-0.5739130434782609
],
"choice": [
0,
2,
1,
3
]
},
{
"features": [
0.5386240622922799,
-0.09230769230769231,
0.0,
-0.5582880434782608
],
"choice": [
0,
2,
3
]
},
{
"features": [
-0.41133320672300827,
-0.18461538461538463,
0.0,
0.4260869565217391
],
"choice": [
1,
2,
0,
3
]
},
{
"features": [
-0.31155635742094767,
12.36923076923077,
0.0,
0.3865087169129372
],
"choice": [
0,
2,
1,
3
]
},
{
"features": [
-0.40594435476213087,
-0.06153846153846154,
0.0,
-0.7114130434782607
],
"choice": [
0,
2,
3
]
},
{
"features": [
0.0,
32.83076923076923,
0.0,
0.4260869565217391
],
"choice": [
0,
2,
3
]
},
{
"features": [
1.6675766783781218,
0.0,
0.0,
0.4260869565217391
],
"choice": [
2,
0,
3
]
},
{
"features": [
-0.36356946158959264,
0.8923076923076924,
0.0,
-1.2266908212560386
],
"choice": [
2,
0,
3
]
},
{
"features": [
-0.38225239768303104,
-0.05384615384615385,
0.0,
0.4260869565217391
],
"choice": [
3
]
},
{
"features": [
-0.3590352293229513,
0.06153846153846154,
0.0,
-1.3239130434782607
],
"choice": [
0,
3
]
},
{
"features": [
0.3090399772101415,
0.6923076923076923,
0.0,
-0.003997789240972687
],
"choice": [
0,
2,
3
]
},
{
"features": [
-0.3118649700883107,
-0.17692307692307693,
0.0,
0.4260869565217391
],
"choice": [
2,
0,
3
]
},
{
"features": [
0.0,
32.83076923076923,
0.0,
0.4260869565217391
],
"choice": [
3
]
},
{
"features": [
-0.3178473079479632,
-0.06153846153846154,
0.0,
0.4260869565217391
],
"choice": [
0,
3
]
}
],
"configsource": [
"Amazon_employee_access",
"kc1",
"Helena",
"default"
]
}

View File

@ -1,328 +0,0 @@
{
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "rf",
"hyperparameters": {
"n_estimators": 1000,
"max_features": 0.1779692423238241,
"max_leaves": 7499,
"criterion": "gini"
}
},
{
"class": "rf",
"hyperparameters": {
"n_estimators": 400,
"max_features": 0.8961466398827462,
"max_leaves": 25095,
"criterion": "entropy",
"FLAML_sample_size": 470620
}
},
{
"class": "rf",
"hyperparameters": {
"n_estimators": 470,
"max_features": 0.12698484669953783,
"max_leaves": 31499,
"criterion": "entropy"
}
},
{
"class": "rf",
"hyperparameters": {
"n_estimators": 632,
"max_features": 1.0,
"max_leaves": 1360,
"criterion": "entropy"
}
},
{
"class": "rf",
"hyperparameters": {
"n_estimators": 1713,
"max_features": 0.40966311008832224,
"max_leaves": 10210,
"criterion": "entropy",
"FLAML_sample_size": 105352
}
},
{
"class": "rf",
"hyperparameters": {}
}
],
"preprocessing": {
"center": [
40337.0,
54.0,
7.0,
1.0
],
"scale": [
58722.0,
766.0,
6.0,
1.0
]
},
"neighbors": [
{
"features": [
8.217925138789552,
0.0,
0.0,
-0.8148148148148149
],
"choice": [
1,
4,
5
]
},
{
"features": [
5.691767991553421,
0.007832898172323759,
58.0,
0.0
],
"choice": [
0,
2,
5
]
},
{
"features": [
0.385937127482034,
0.9530026109660574,
0.5,
0.0
],
"choice": [
4,
2,
1,
3,
0,
5
]
},
{
"features": [
0.3123020333094922,
-0.03524804177545692,
15.5,
0.0
],
"choice": [
0,
3,
2,
1,
5
]
},
{
"features": [
0.5964033922550321,
0.0,
-0.5,
0.0
],
"choice": [
4,
1,
3,
0,
2,
5
]
},
{
"features": [
-0.5336500800381458,
9.328981723237598,
0.5,
0.0
],
"choice": [
0,
2,
5
]
},
{
"features": [
0.20201968597799802,
-0.0587467362924282,
0.0,
0.0
],
"choice": [
1,
4,
5
]
},
{
"features": [
0.20677088655018563,
0.16449086161879894,
0.5,
0.0
],
"choice": [
4,
1,
2,
0,
3,
5
]
},
{
"features": [
-0.6604339089268076,
-0.06266318537859007,
-0.5,
-1.0
],
"choice": [
3,
1,
5
]
},
{
"features": [
-0.6703620448894793,
1.0469973890339426,
0.3333333333333333,
0.0
],
"choice": [
0,
5
]
},
{
"features": [
0.34848949286468445,
-0.015665796344647518,
-0.6666666666666666,
-1.0
],
"choice": [
4,
2,
0,
5
]
},
{
"features": [
-0.5336500800381458,
2.5404699738903394,
-0.3333333333333333,
0.0
],
"choice": [
4,
3,
1,
2,
0,
5
]
},
{
"features": [
-0.5606757263036,
0.9738903394255874,
0.0,
0.0
],
"choice": [
2,
4,
0,
3,
1,
5
]
},
{
"features": [
0.0,
-0.06266318537859007,
-0.6666666666666666,
0.0
],
"choice": [
3,
1,
4,
0,
5
]
},
{
"features": [
-0.6562617077075031,
0.21148825065274152,
0.5,
0.0
],
"choice": [
4,
0,
3,
1,
2,
5
]
},
{
"features": [
-0.6515105071353156,
-0.04960835509138381,
0.0,
0.0
],
"choice": [
1,
4,
3,
5
]
},
{
"features": [
-0.6739552467559007,
-0.04699738903394256,
-0.5,
0.0
],
"choice": [
3,
1,
4,
5
]
}
],
"configsource": [
"Helena",
"Covertype",
"Fashion-MNIST",
"jungle_chess_2pcs_raw_endgame_complete",
"MiniBooNE",
"default"
]
}

View File

@ -1,293 +0,0 @@
{
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "rf",
"hyperparameters": {
"n_estimators": 960,
"max_features": 0.694616932858775,
"max_leaves": 8937
}
},
{
"class": "rf",
"hyperparameters": {
"n_estimators": 2047,
"max_features": 1.0,
"max_leaves": 32767,
"FLAML_sample_size": 830258
}
},
{
"class": "rf",
"hyperparameters": {
"n_estimators": 2047,
"max_features": 0.6683903035731483,
"max_leaves": 591,
"criterion": "entropy"
}
},
{
"class": "rf",
"hyperparameters": {}
}
],
"preprocessing": {
"center": [
36691.0,
10.0,
0.0,
0.85
],
"scale": [
460950.5,
5.5,
1.0,
0.48611111111111116
]
},
"neighbors": [
{
"features": [
0.0,
0.0,
0.0,
0.3085714285714286
],
"choice": [
2,
0,
1,
3
]
},
{
"features": [
-0.052751868150701646,
5.454545454545454,
0.0,
0.3085714285714286
],
"choice": [
0,
3
]
},
{
"features": [
1.8728887375108607,
-0.18181818181818182,
0.0,
-0.3771428571428571
],
"choice": [
2,
0,
1,
3
]
},
{
"features": [
0.14813955077605948,
-0.18181818181818182,
0.0,
-1.52
],
"choice": [
2,
0,
1,
3
]
},
{
"features": [
-0.04543871847410948,
-0.18181818181818182,
0.0,
-0.3771428571428571
],
"choice": [
2,
1,
0,
3
]
},
{
"features": [
-0.018869705098486712,
-0.18181818181818182,
0.0,
-1.2914285714285714
],
"choice": [
2,
0,
1,
3
]
},
{
"features": [
1.8728887375108607,
1.4545454545454546,
0.0,
-0.6057142857142855
],
"choice": [
0,
3
]
},
{
"features": [
1.8728887375108607,
0.0,
0.0,
-1.5428571428571427
],
"choice": [
0,
2,
1,
3
]
},
{
"features": [
0.266278049378404,
0.0,
0.0,
0.3085714285714286
],
"choice": [
2,
0,
1,
3
]
},
{
"features": [
0.0,
0.0,
0.0,
0.3085714285714286
],
"choice": [
1,
0,
3
]
},
{
"features": [
-0.035114399485411125,
1.0909090909090908,
0.0,
0.3085714285714286
],
"choice": [
3
]
},
{
"features": [
-0.035114399485411125,
-0.36363636363636365,
0.0,
0.3085714285714286
],
"choice": [
0,
2,
1,
3
]
},
{
"features": [
-0.03929923061152987,
-0.36363636363636365,
0.0,
0.3085714285714286
],
"choice": [
0,
1,
3
]
},
{
"features": [
0.0,
0.0,
0.0,
-0.3085714285714286
],
"choice": [
1,
3
]
},
{
"features": [
1.056425798431719,
1.4545454545454546,
0.0,
-0.7199999999999999
],
"choice": [
3
]
},
{
"features": [
0.6902650067631991,
-0.18181818181818182,
0.0,
-1.0628571428571427
],
"choice": [
1,
3
]
},
{
"features": [
1.92172044503694,
0.0,
0.0,
0.3085714285714286
],
"choice": [
3
]
},
{
"features": [
-0.050311259018050745,
6.909090909090909,
0.0,
0.3085714285714286
],
"choice": [
0,
2,
1,
3
]
}
],
"configsource": [
"houses",
"poker",
"bank-marketing",
"default"
]
}

View File

@ -1,261 +0,0 @@
import numpy as np
import logging
import pathlib
import json
from flaml.automl.data import DataTransformer
from flaml.automl.task.task import CLASSIFICATION, get_classification_objective
from flaml.automl.task.generic_task import len_labels
from flaml.automl.task.factory import task_factory
from flaml.version import __version__
try:
from sklearn.neighbors import NearestNeighbors
except ImportError:
pass
# Directory containing this module — used as the default location of the
# bundled portfolio JSON files in load_config_predictor.
LOCATION = pathlib.Path(__file__).parent.resolve()
logger = logging.getLogger(__name__)
# Process-wide cache mapping "location/estimator/task" -> parsed predictor JSON,
# populated lazily by load_config_predictor.
CONFIG_PREDICTORS = {}
def meta_feature(task, X_train, y_train, meta_feature_names):
    """Compute the requested meta features of a training set.

    Args:
        task: The task; membership in CLASSIFICATION decides whether the
            number of classes is computed from the labels or reported as 0.
        X_train: Training features with a `.shape` of (n_rows, n_features).
        y_train: Training labels; only used for "NumberOfClasses".
        meta_feature_names: Iterable of feature names to compute, each one of
            "NumberOfInstances", "NumberOfFeatures", "NumberOfClasses", or
            "PercentageOfNumericFeatures".

    Returns:
        A list of meta feature values, in the order of `meta_feature_names`.

    Raises:
        ValueError: If an unknown meta feature name is requested.
    """
    n_rows, n_cols = X_train.shape[0], X_train.shape[1]
    classification = task in CLASSIFICATION

    def _numeric_fraction():
        # Fraction of numeric columns; only dataframes support select_dtypes.
        try:
            numeric_cols = X_train.select_dtypes(include=[np.number, "float", "int", "long"]).shape[1]
            return numeric_cols / n_cols
        except AttributeError:
            # 'numpy.ndarray' object has no attribute 'select_dtypes':
            # treat all features as numeric.
            return 1

    values = []
    for name in meta_feature_names:
        if name == "NumberOfInstances":
            values.append(n_rows)
        elif name == "NumberOfFeatures":
            values.append(n_cols)
        elif name == "NumberOfClasses":
            values.append(len_labels(y_train) if classification else 0)
        elif name == "PercentageOfNumericFeatures":
            values.append(_numeric_fraction())
        else:
            raise ValueError("Feature {} not implemented. ".format(name))
    return values
def load_config_predictor(estimator_name, task, location=None):
    """Load (and cache) the mined config predictor for an estimator/task pair.

    Args:
        estimator_name: The learner name, e.g. "lgbm"; selects the portfolio
            subdirectory.
        task: The task (object or string); converted with str() for lookup.
        location: (Optional) directory containing the mined portfolio files;
            defaults to this package's own directory (LOCATION).

    Returns:
        The predictor dict parsed from
        `<location>/<estimator_name>/<task>.json`.

    Raises:
        FileNotFoundError: If no portfolio file exists for this pair.
    """
    task = str(task)
    key = f"{location}/{estimator_name}/{task}"
    predictor = CONFIG_PREDICTORS.get(key)
    # Explicit None check: a previously cached (even empty) predictor dict
    # must count as a cache hit, which a truthiness test would miss.
    if predictor is not None:
        return predictor
    task = "multiclass" if task == "multi" else task  # TODO: multi -> multiclass?
    try:
        location = location or LOCATION
        with open(f"{location}/{estimator_name}/{task}.json", "r") as f:
            CONFIG_PREDICTORS[key] = predictor = json.load(f)
    except FileNotFoundError as e:
        # Chain the original error so the failing path is preserved.
        raise FileNotFoundError(f"Portfolio has not been built for {estimator_name} on {task} task.") from e
    return predictor
def suggest_config(
    task,
    X,
    y,
    estimator_or_predictor,
    location=None,
    k=None,
    meta_feature_fn=meta_feature,
):
    """Suggest a list of configs for the given task and training data.

    The returned configs can be used as starting points for AutoML.fit().
    `FLAML_sample_size` is removed from the configs.

    Args:
        task: The task; "classification" with labels provided is resolved to a
            finer-grained classification objective first.
        X: Training features, passed to `meta_feature_fn`.
        y: Training labels, passed to `meta_feature_fn`.
        estimator_or_predictor: A learner name (str), used to load the mined
            config predictor, or an already-loaded predictor dict.
        location: (Optional) directory containing the mined portfolio files.
        k: (Optional) maximum number of configs to return; None returns every
            config chosen for the nearest neighbor.
        meta_feature_fn: Function computing the meta-feature vector.

    Returns:
        A list of config dicts (copies, so callers may mutate them freely).
    """
    from packaging.version import parse as version_parse

    task = get_classification_objective(len_labels(y)) if task == "classification" and y is not None else task
    predictor = (
        load_config_predictor(estimator_or_predictor, task, location)
        if isinstance(estimator_or_predictor, str)
        else estimator_or_predictor
    )
    older_version = "1.0.2"
    # TODO: update older_version when the newer code can no longer handle the older version json file
    assert version_parse(__version__) >= version_parse(predictor["version"]) >= version_parse(older_version)
    prep = predictor["preprocessing"]
    feature = meta_feature_fn(task, X_train=X, y_train=y, meta_feature_names=predictor["meta_feature_names"])
    feature = (np.array(feature) - np.array(prep["center"])) / np.array(prep["scale"])
    neighbors = predictor["neighbors"]
    nn = NearestNeighbors(n_neighbors=1)
    nn.fit([x["features"] for x in neighbors])
    dist, ind = nn.kneighbors(feature.reshape(1, -1), return_distance=True)
    logger.info(f"metafeature distance: {dist.item()}")
    ind = int(ind.item())
    choice = neighbors[ind]["choice"] if k is None else neighbors[ind]["choice"][:k]
    # Return copies of the portfolio entries: the predictor may be the shared
    # module-level cache (CONFIG_PREDICTORS), and popping FLAML_sample_size
    # from the originals would mutate that cache in place.
    configs = []
    for idx in choice:
        config = dict(predictor["portfolio"][idx])
        if "hyperparameters" in config:
            hyperparams = config["hyperparameters"]
            if hyperparams and "FLAML_sample_size" in hyperparams:
                config["hyperparameters"] = {
                    name: value for name, value in hyperparams.items() if name != "FLAML_sample_size"
                }
        configs.append(config)
    return configs
def suggest_learner(task, X, y, estimator_or_predictor="all", estimator_list=None, location=None):
    """Suggest the best learner within estimator_list."""
    configs = suggest_config(task, X, y, estimator_or_predictor, location)
    if not estimator_list:
        # No restriction: the top-ranked config's learner wins.
        return configs[0]["class"]
    # First suggested config whose learner is allowed; otherwise fall back to
    # the caller's first choice.
    return next(
        (config["class"] for config in configs if config["class"] in estimator_list),
        estimator_list[0],
    )
def suggest_hyperparams(task, X, y, estimator_or_predictor, location=None):
    """Suggest hyperparameter configurations and an estimator class.

    The configurations can be used to initialize the estimator class like
    lightgbm.LGBMRegressor.

    Example:
    ```python
    hyperparams, estimator_class = suggest_hyperparams("regression", X_train, y_train, "lgbm")
    model = estimator_class(**hyperparams)  # estimator_class is LGBMRegressor
    model.fit(X_train, y_train)
    ```

    Args:
        task: A string of the task type, e.g.,
            'classification', 'regression', 'ts_forecast', 'rank',
            'seq-classification', 'seq-regression'.
        X: A dataframe of training data in shape n*m.
            For 'ts_forecast' task, the first column of X_train
            must be the timestamp column (datetime type). Other
            columns in the dataframe are assumed to be exogenous
            variables (categorical or numeric).
        y: A series of labels in shape n*1.
        estimator_or_predictor: A str of the learner name or a dict of the learned config predictor.
            If a dict, it contains:
            - "version": a str of the version number.
            - "preprocessing": a dictionary containing:
                * "center": a list of meta feature value offsets for normalization.
                * "scale": a list of meta feature scales to normalize each dimension.
            - "neighbors": a list of dictionaries. Each dictionary contains:
                * "features": a list of the normalized meta features for a neighbor.
                * "choice": an integer of the configuration id in the portfolio.
            - "portfolio": a list of dictionaries, each corresponding to a configuration:
                * "class": a str of the learner name.
                * "hyperparameters": a dict of the config. The key "FLAML_sample_size" will be ignored.
        location: (Optional) A str of the location containing mined portfolio file.
            Only valid when the portfolio is a str, by default the location is flaml/default.

    Returns:
        hyperparams: A dict of the hyperparameter configurations.
        estimator_class: A class of the underlying estimator, e.g., lightgbm.LGBMClassifier.
    """
    best = suggest_config(task, X, y, estimator_or_predictor, location=location, k=1)[0]
    task_obj = task_factory(task)
    model_class = task_obj.estimator_class_from_str(best["class"])
    raw_hyperparams = best["hyperparameters"]
    model = model_class(task=task_obj.name, **raw_hyperparams)
    # Empty hyperparams short-circuit to themselves; otherwise report the
    # wrapper's `params`. NOTE(review): presumably `model.params` is the
    # resolved form of `raw_hyperparams` — confirm against the model wrapper.
    final_hyperparams = raw_hyperparams and model.params
    return final_hyperparams, model.estimator_class
class AutoMLTransformer:
    """Composes a data transformer with a model's own preprocessing.

    `transform` applies the data transformer first, then the model-specific
    `_preprocess`, so test data goes through the same pipeline as the
    training data did in `preprocess_and_suggest_hyperparams`.
    """

    def __init__(self, model, data_transformer):
        # Attribute names kept stable: instances are handed back to callers.
        self._model = model
        self._dt = data_transformer

    def transform(self, X):
        """Return X after the data transformer and the model's preprocessing."""
        transformed = self._dt.transform(X)
        return self._model._preprocess(transformed)
def preprocess_and_suggest_hyperparams(
    task,
    X,
    y,
    estimator_or_predictor,
    location=None,
):
    """Preprocess the data and suggest hyperparameters.
    Example:
    ```python
    hyperparams, estimator_class, X, y, feature_transformer, label_transformer = \
        preprocess_and_suggest_hyperparams("classification", X_train, y_train, "xgb_limitdepth")
    model = estimator_class(**hyperparams)  # estimator_class is XGBClassifier
    model.fit(X, y)
    X_test = feature_transformer.transform(X_test)
    y_pred = label_transformer.inverse_transform(pd.Series(model.predict(X_test).astype(int)))
    ```
    Args:
        task: A string of the task type, e.g.,
            'classification', 'regression', 'ts_forecast', 'rank',
            'seq-classification', 'seq-regression'.
        X: A dataframe of training data in shape n*m.
            For 'ts_forecast' task, the first column of X_train
            must be the timestamp column (datetime type). Other
            columns in the dataframe are assumed to be exogenous
            variables (categorical or numeric).
        y: A series of labels in shape n*1.
        estimator_or_predictor: A str of the learner name or a dict of the learned config predictor.
            "choose_xgb" means choosing between xgb_limitdepth and xgboost.
            If a dict, it contains:
            - "version": a str of the version number.
            - "preprocessing": a dictionary containing:
                * "center": a list of meta feature value offsets for normalization.
                * "scale": a list of meta feature scales to normalize each dimension.
            - "neighbors": a list of dictionaries. Each dictionary contains:
                * "features": a list of the normalized meta features for a neighbor.
                * "choice": an integer of the configuration id in the portfolio.
            - "portfolio": a list of dictionaries, each corresponding to a configuration:
                * "class": a str of the learner name.
                * "hyperparameters": a dict of the config. The key "FLAML_sample_size" will be ignored.
        location: (Optional) A str of the location containing mined portfolio file.
            Only valid when the portfolio is a str, by default the location is flaml/default.
    Returns:
        hyperparams: A dict of the hyperparameter configurations.
        estimator_class: A class of the underlying estimator, e.g., lightgbm.LGBMClassifier.
        X: the preprocessed X.
        y: the preprocessed y.
        feature_transformer: a data transformer that can be applied to X_test.
        label_transformer: a label transformer that can be applied to y_test.
    """
    # Fit the generic data transformer first; the suggested config is chosen
    # from the transformed data's meta features.
    dt = DataTransformer()
    X, y = dt.fit_transform(X, y, task)
    if "choose_xgb" == estimator_or_predictor:
        # choose between xgb_limitdepth and xgboost
        estimator_or_predictor = suggest_learner(
            task,
            X,
            y,
            estimator_list=["xgb_limitdepth", "xgboost"],
            location=location,
        )
    # k=1 keeps only the top-ranked config for the nearest neighbor.
    config = suggest_config(task, X, y, estimator_or_predictor, location=location, k=1)[0]
    estimator = config["class"]
    model_class = task_factory(task).estimator_class_from_str(estimator)
    hyperparams = config["hyperparameters"]
    model = model_class(task=task, **hyperparams)
    if model.estimator_class is None:
        # NOTE(review): no underlying estimator class for this wrapper —
        # return the wrapper class itself and no transformers; confirm which
        # learners hit this branch.
        return hyperparams, model_class, X, y, None, None
    else:
        estimator_class = model.estimator_class
        # Apply the model-specific preprocessing so the returned X matches
        # what the estimator expects at fit time.
        X = model._preprocess(X)
        # Empty hyperparams stay empty; otherwise use the wrapper's params.
        hyperparams = hyperparams and model.params
        # The returned feature_transformer replays dt.transform followed by
        # the model preprocessing on future data (e.g., X_test).
        transformer = AutoMLTransformer(model, dt)
        return hyperparams, estimator_class, X, y, transformer, dt.label_transformer

View File

@ -1,329 +0,0 @@
{
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "xgb_limitdepth",
"hyperparameters": {
"n_estimators": 877,
"max_depth": 11,
"min_child_weight": 0.6205465771093738,
"learning_rate": 0.013622118381700795,
"subsample": 0.566692814245426,
"colsample_bylevel": 0.8865741642101924,
"colsample_bytree": 1.0,
"reg_alpha": 0.01386336444764391,
"reg_lambda": 3.113947886074155
}
},
{
"class": "xgb_limitdepth",
"hyperparameters": {
"n_estimators": 5457,
"max_depth": 6,
"min_child_weight": 0.19978269031877885,
"learning_rate": 0.003906732665632749,
"subsample": 0.8207785234496902,
"colsample_bylevel": 0.8438751931476698,
"colsample_bytree": 0.42202862997585794,
"reg_alpha": 0.017372558844968737,
"reg_lambda": 0.03977802121721031
}
},
{
"class": "xgb_limitdepth",
"hyperparameters": {
"n_estimators": 3526,
"max_depth": 13,
"min_child_weight": 0.0994486725676356,
"learning_rate": 0.0009765625,
"subsample": 0.46123759274652554,
"colsample_bylevel": 1.0,
"colsample_bytree": 0.4498813776397717,
"reg_alpha": 0.002599398546499414,
"reg_lambda": 0.028336396854402753
}
},
{
"class": "xgb_limitdepth",
"hyperparameters": {}
}
],
"preprocessing": {
"center": [
18000.0,
21.0,
2.0,
0.7565217391304347
],
"scale": [
39542.5,
143.0,
1.0,
0.5714285714285715
]
},
"neighbors": [
{
"features": [
1.2745779857115762,
1.0419580419580419,
0.0,
0.4260869565217391
],
"choice": [
0,
1,
3
]
},
{
"features": [
11.821306189542897,
-0.0979020979020979,
0.0,
-0.5739130434782609
],
"choice": [
0,
2,
3
]
},
{
"features": [
0.290624012138838,
-0.08391608391608392,
0.0,
-1.3239130434782607
],
"choice": [
2,
1,
0,
3
]
},
{
"features": [
-0.4395018018587596,
-0.04895104895104895,
0.0,
-0.5739130434782609
],
"choice": [
2,
0,
1,
3
]
},
{
"features": [
0.68280963520263,
1.4615384615384615,
0.0,
0.0
],
"choice": [
1,
2,
0,
3
]
},
{
"features": [
0.65643295188721,
-0.04895104895104895,
0.0,
-0.5739130434782609
],
"choice": [
1,
3
]
},
{
"features": [
0.5737876967819435,
-0.03496503496503497,
0.0,
-0.5582880434782608
],
"choice": [
2,
1,
0,
3
]
},
{
"features": [
-0.4381867610798508,
-0.11888111888111888,
0.0,
0.4260869565217391
],
"choice": [
0,
1,
3
]
},
{
"features": [
-0.3318960611999747,
11.293706293706293,
0.0,
0.3865087169129372
],
"choice": [
1,
0,
2,
3
]
},
{
"features": [
-0.432446102294999,
-0.006993006993006993,
0.0,
-0.7114130434782607
],
"choice": [
0,
1,
3
]
},
{
"features": [
0.0,
29.895104895104897,
0.0,
0.4260869565217391
],
"choice": [
0,
1,
2,
3
]
},
{
"features": [
1.7764430675855092,
0.04895104895104895,
0.0,
0.4260869565217391
],
"choice": [
0,
1,
2,
3
]
},
{
"features": [
-0.3873047986343807,
0.8601398601398601,
0.0,
-1.2266908212560386
],
"choice": [
2,
0,
1,
3
]
},
{
"features": [
-0.40720743503824997,
0.0,
0.0,
0.4260869565217391
],
"choice": [
1,
0,
2,
3
]
},
{
"features": [
-0.38247455269646585,
0.1048951048951049,
0.0,
-1.3239130434782607
],
"choice": [
0,
1,
3
]
},
{
"features": [
0.32921540115066067,
0.6783216783216783,
0.0,
-0.003997789240972687
],
"choice": [
0,
1,
3
]
},
{
"features": [
-0.3322248213947019,
-0.11188811188811189,
0.0,
0.4260869565217391
],
"choice": [
0,
3
]
},
{
"features": [
0.0,
29.895104895104897,
0.0,
0.4260869565217391
],
"choice": [
0,
1,
3
]
},
{
"features": [
-0.3385977113232598,
-0.006993006993006993,
0.0,
0.4260869565217391
],
"choice": [
1,
3
]
}
],
"configsource": [
"Jannis",
"adult",
"Amazon_employee_access",
"default"
]
}

View File

@ -1,357 +0,0 @@
{
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "xgb_limitdepth",
"hyperparameters": {
"n_estimators": 1191,
"max_depth": 13,
"min_child_weight": 6.4007885677724605,
"learning_rate": 0.037622775650237326,
"subsample": 1.0,
"colsample_bylevel": 0.3697773165627811,
"colsample_bytree": 0.813871237069598,
"reg_alpha": 0.0009765625,
"reg_lambda": 1.075702708240612
}
},
{
"class": "xgb_limitdepth",
"hyperparameters": {
"n_estimators": 1499,
"max_depth": 11,
"min_child_weight": 0.07563529776156448,
"learning_rate": 0.039042609221240955,
"subsample": 0.7832981935783824,
"colsample_bylevel": 1.0,
"colsample_bytree": 1.0,
"reg_alpha": 0.0009765625,
"reg_lambda": 23.513066752844153
}
},
{
"class": "xgb_limitdepth",
"hyperparameters": {
"n_estimators": 313,
"max_depth": 7,
"min_child_weight": 30.424259012001368,
"learning_rate": 0.08466828646360688,
"subsample": 0.9897083979469301,
"colsample_bylevel": 0.6769490906308069,
"colsample_bytree": 1.0,
"reg_alpha": 0.0014544085935366477,
"reg_lambda": 34.09911172306857
}
},
{
"class": "xgb_limitdepth",
"hyperparameters": {
"n_estimators": 566,
"max_depth": 13,
"min_child_weight": 0.013176186839973599,
"learning_rate": 0.09285619488896565,
"subsample": 0.5897287493640815,
"colsample_bylevel": 0.923664288991597,
"colsample_bytree": 0.8244714790646485,
"reg_alpha": 0.023484974838756726,
"reg_lambda": 0.5690298249126402,
"FLAML_sample_size": 470620
}
},
{
"class": "xgb_limitdepth",
"hyperparameters": {}
},
{
"class": "xgb_limitdepth",
"hyperparameters": {
"n_estimators": 971,
"max_depth": 8,
"min_child_weight": 0.0044052948947322645,
"learning_rate": 0.15171239415469703,
"subsample": 0.8340342805529243,
"colsample_bylevel": 0.9489310919814007,
"colsample_bytree": 0.022724724669028674,
"reg_alpha": 0.0009765625,
"reg_lambda": 0.0025897714798936954
}
},
{
"class": "xgb_limitdepth",
"hyperparameters": {
"n_estimators": 464,
"max_depth": 2,
"min_child_weight": 0.0068282719220722,
"learning_rate": 0.07962498837600937,
"subsample": 0.47139986510869014,
"colsample_bylevel": 0.4814471959023239,
"colsample_bytree": 0.6050207253592859,
"reg_alpha": 0.0010290828959872173,
"reg_lambda": 0.0103104214002687
}
},
{
"class": "xgb_limitdepth",
"hyperparameters": {
"n_estimators": 1799,
"max_depth": 3,
"min_child_weight": 0.0010034151843327725,
"learning_rate": 0.03453775119035777,
"subsample": 0.31322065037892344,
"colsample_bylevel": 1.0,
"colsample_bytree": 0.2219038021462818,
"reg_alpha": 0.03885163786709896,
"reg_lambda": 1.1077175359756786
}
}
],
"preprocessing": {
"center": [
24668.5,
54.0,
7.0,
1.0
],
"scale": [
57198.0,
770.5,
6.0,
1.0
]
},
"neighbors": [
{
"features": [
8.710820308402392,
0.0,
0.0,
-0.8148148148148149
],
"choice": [
0,
3,
4
]
},
{
"features": [
0.6701545508584216,
0.9474367293964958,
0.5,
0.0
],
"choice": [
0,
2,
7,
4
]
},
{
"features": [
0.5945575020105598,
-0.03504218040233614,
15.5,
0.0
],
"choice": [
0,
2,
7,
6,
3,
4
]
},
{
"features": [
0.8862285394594217,
0.0,
-0.5,
0.0
],
"choice": [
2,
4
]
},
{
"features": [
-0.2739344033008147,
9.2744970798183,
0.5,
0.0
],
"choice": [
0,
2,
7,
6,
4
]
},
{
"features": [
0.48133676002657433,
-0.058403634003893576,
0.0,
0.0
],
"choice": [
1,
4
]
},
{
"features": [
0.4862145529563971,
0.16353017521090202,
0.5,
0.0
],
"choice": [
0,
1,
4
]
},
{
"features": [
-0.40409629707332423,
-0.06229720960415315,
-0.5,
-1.0
],
"choice": [
4
]
},
{
"features": [
-0.41428896115248787,
1.0408825438027256,
0.3333333333333333,
0.0
],
"choice": [
5,
3,
1,
7,
6,
4
]
},
{
"features": [
0.6317091506696039,
-0.015574302401038288,
-0.6666666666666666,
-1.0
],
"choice": [
1,
0,
3,
4
]
},
{
"features": [
-0.2739344033008147,
2.5256327060350423,
-0.3333333333333333,
0.0
],
"choice": [
0,
5,
3,
7,
4
]
},
{
"features": [
-0.30168012867582783,
0.9682024659312135,
0.0,
0.0
],
"choice": [
1,
3,
4
]
},
{
"features": [
0.2739344033008147,
-0.06229720960415315,
-0.6666666666666666,
0.0
],
"choice": [
4
]
},
{
"features": [
-0.39981293052204625,
0.21025308241401688,
0.5,
0.0
],
"choice": [
7,
4
]
},
{
"features": [
-0.3949351375922235,
-0.04931862426995458,
0.0,
0.0
],
"choice": [
6,
0,
7,
1,
3,
4
]
},
{
"features": [
-0.41797790132522117,
-0.04672290720311486,
-0.5,
0.0
],
"choice": [
6,
1,
7,
2,
0,
3,
4
]
}
],
"configsource": [
"guillermo",
"connect-4",
"Helena",
"Covertype",
"default",
"cnae-9",
"vehicle",
"mfeat-factors"
]
}

View File

@ -1,350 +0,0 @@
{
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "xgb_limitdepth",
"hyperparameters": {
"n_estimators": 4923,
"max_depth": 12,
"min_child_weight": 0.7625732991776795,
"learning_rate": 0.009239549681857523,
"subsample": 0.8193164619615052,
"colsample_bylevel": 0.7785754297307862,
"colsample_bytree": 0.788491073979525,
"reg_alpha": 0.002282749364196872,
"reg_lambda": 131.2194560716441
}
},
{
"class": "xgb_limitdepth",
"hyperparameters": {
"n_estimators": 2111,
"max_depth": 9,
"min_child_weight": 3.405822241186395,
"learning_rate": 0.005804247705198151,
"subsample": 0.37848422782052427,
"colsample_bylevel": 0.8228350674288559,
"colsample_bytree": 0.8813475713109656,
"reg_alpha": 0.009761356063132219,
"reg_lambda": 13.187783936727843,
"FLAML_sample_size": 810000
}
},
{
"class": "xgb_limitdepth",
"hyperparameters": {
"n_estimators": 1499,
"max_depth": 11,
"min_child_weight": 0.07563529776156448,
"learning_rate": 0.039042609221240955,
"subsample": 0.7832981935783824,
"colsample_bylevel": 1.0,
"colsample_bytree": 1.0,
"reg_alpha": 0.0009765625,
"reg_lambda": 23.513066752844153
}
},
{
"class": "xgb_limitdepth",
"hyperparameters": {
"n_estimators": 19722,
"max_depth": 11,
"min_child_weight": 6.46800727978204,
"learning_rate": 0.0010837437950202355,
"subsample": 0.49509562408032115,
"colsample_bylevel": 1.0,
"colsample_bytree": 0.8826299329274134,
"reg_alpha": 0.23887161121959208,
"reg_lambda": 15.163773888208217
}
},
{
"class": "xgb_limitdepth",
"hyperparameters": {
"n_estimators": 544,
"max_depth": 12,
"min_child_weight": 79.32555867011995,
"learning_rate": 0.010128107120014433,
"subsample": 0.9799974977817297,
"colsample_bylevel": 0.881815418056542,
"colsample_bytree": 0.9718556912196423,
"reg_alpha": 72.63148950428749,
"reg_lambda": 1.4601415712058006
}
},
{
"class": "xgb_limitdepth",
"hyperparameters": {}
}
],
"preprocessing": {
"center": [
36691.0,
10.0,
0.0,
1.0
],
"scale": [
140856.0,
1.0,
1.0,
0.4444444444444444
]
},
"neighbors": [
{
"features": [
0.0,
0.0,
0.0,
0.0
],
"choice": [
4,
5
]
},
{
"features": [
-0.17263020389617767,
30.0,
0.0,
0.0
],
"choice": [
2,
0,
5
]
},
{
"features": [
6.129018288180837,
-1.0,
0.0,
-0.7500000000000001
],
"choice": [
1,
0,
2,
4,
5
]
},
{
"features": [
0.48478588061566424,
-1.0,
0.0,
-2.0
],
"choice": [
4,
1,
3,
5
]
},
{
"features": [
-0.14869796103822344,
-1.0,
0.0,
-0.7500000000000001
],
"choice": [
4,
1,
3,
0,
5
]
},
{
"features": [
-0.06175100812176975,
-1.0,
0.0,
-1.7500000000000002
],
"choice": [
4,
1,
5
]
},
{
"features": [
6.129018288180837,
8.0,
0.0,
-1.0
],
"choice": [
0,
2,
1,
4,
5
]
},
{
"features": [
6.129018288180837,
0.0,
0.0,
-2.0250000000000004
],
"choice": [
1,
0,
2,
4,
5
]
},
{
"features": [
0.8713934798659624,
0.0,
0.0,
0.0
],
"choice": [
4,
5
]
},
{
"features": [
0.0,
0.0,
0.0,
0.0
],
"choice": [
1,
3,
0,
2,
5
]
},
{
"features": [
-0.11491168285341058,
6.0,
0.0,
0.0
],
"choice": [
3,
1,
0,
2,
4,
5
]
},
{
"features": [
-0.11491168285341058,
-2.0,
0.0,
0.0
],
"choice": [
0,
1,
3,
2,
4,
5
]
},
{
"features": [
-0.1286065201340376,
-2.0,
0.0,
0.0
],
"choice": [
3,
0,
2,
1,
4,
5
]
},
{
"features": [
0.0,
0.0,
0.0,
-0.6750000000000002
],
"choice": [
2,
3,
1,
0,
5
]
},
{
"features": [
6.288819787584483,
0.0,
0.0,
0.0
],
"choice": [
2,
0,
1,
5
]
},
{
"features": [
-0.16464332367808257,
38.0,
0.0,
0.0
],
"choice": [
0,
2,
3,
1,
5
]
},
{
"features": [
-0.15343329357641847,
-7.0,
0.0,
-1.5000000000000002
],
"choice": [
3,
5
]
}
],
"configsource": [
"higgs",
"bng_pharynx",
"connect-4",
"house_16H",
"bng_echomonths",
"default"
]
}

View File

@ -1,375 +0,0 @@
{
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "xgboost",
"hyperparameters": {
"n_estimators": 319,
"max_leaves": 1312,
"min_child_weight": 0.001,
"learning_rate": 0.01872379806270421,
"subsample": 0.6890079660561895,
"colsample_bylevel": 0.7551225121854014,
"colsample_bytree": 0.7860755604500558,
"reg_alpha": 0.17028752704343114,
"reg_lambda": 1.4375743264564231
}
},
{
"class": "xgboost",
"hyperparameters": {
"n_estimators": 7902,
"max_leaves": 49,
"min_child_weight": 0.038063497848955595,
"learning_rate": 0.0009765625,
"subsample": 0.9357800695141445,
"colsample_bylevel": 0.47031312177249246,
"colsample_bytree": 0.9053386579586192,
"reg_alpha": 1.5286102593845932,
"reg_lambda": 18.96811296717419
}
},
{
"class": "xgboost",
"hyperparameters": {
"n_estimators": 13499,
"max_leaves": 60,
"min_child_weight": 0.008494221584011285,
"learning_rate": 0.006955765856675575,
"subsample": 0.5965241023754743,
"colsample_bylevel": 0.590641168068946,
"colsample_bytree": 1.0,
"reg_alpha": 0.2522240954379289,
"reg_lambda": 5.351809144038808
}
},
{
"class": "xgboost",
"hyperparameters": {
"n_estimators": 591,
"max_leaves": 16651,
"min_child_weight": 0.03356567864689129,
"learning_rate": 0.002595066436678338,
"subsample": 0.9114132805513452,
"colsample_bylevel": 0.9503441844594458,
"colsample_bytree": 0.5703338448066768,
"reg_alpha": 0.010405212349127894,
"reg_lambda": 0.05352660657433639
}
}
],
"preprocessing": {
"center": [
18000.0,
28.0,
2.0,
0.7565217391304347
],
"scale": [
42124.0,
130.0,
1.0,
0.5714285714285715
]
},
"neighbors": [
{
"features": [
1.196467571930491,
1.0923076923076922,
0.0,
0.4260869565217391
],
"choice": [
0,
3,
2,
1
]
},
{
"features": [
11.096856898680088,
-0.16153846153846155,
0.0,
-0.5739130434782609
],
"choice": [
0,
2,
3,
1
]
},
{
"features": [
8.658152122305575,
0.38461538461538464,
0.0,
-0.7405797101449274
],
"choice": [
2,
0,
1,
3
]
},
{
"features": [
0.27281359794891274,
-0.14615384615384616,
0.0,
-1.3239130434782607
],
"choice": [
3,
0,
2,
1
]
},
{
"features": [
-0.4125676573924604,
-0.1076923076923077,
0.0,
-0.5739130434782609
],
"choice": [
3,
1,
0,
2
]
},
{
"features": [
0.6409647706770487,
1.5538461538461539,
0.0,
0.0
],
"choice": [
1,
0,
2,
3
]
},
{
"features": [
2.3515573069983855,
0.16923076923076924,
0.0,
0.4260869565217391
],
"choice": [
2,
0,
1,
3
]
},
{
"features": [
0.6162045389801538,
-0.1076923076923077,
0.0,
-0.5739130434782609
],
"choice": [
1,
0,
2,
3
]
},
{
"features": [
0.5386240622922799,
-0.09230769230769231,
0.0,
-0.5582880434782608
],
"choice": [
0,
1,
3,
2
]
},
{
"features": [
-0.41133320672300827,
-0.18461538461538463,
0.0,
0.4260869565217391
],
"choice": [
2,
1,
0,
3
]
},
{
"features": [
-0.31155635742094767,
12.36923076923077,
0.0,
0.3865087169129372
],
"choice": [
2,
1,
0,
3
]
},
{
"features": [
-0.40594435476213087,
-0.06153846153846154,
0.0,
-0.7114130434782607
],
"choice": [
0,
1,
2,
3
]
},
{
"features": [
0.0,
32.83076923076923,
0.0,
0.4260869565217391
],
"choice": [
0
]
},
{
"features": [
1.6675766783781218,
0.0,
0.0,
0.4260869565217391
],
"choice": [
2,
0,
1,
3
]
},
{
"features": [
-0.36356946158959264,
0.8923076923076924,
0.0,
-1.2266908212560386
],
"choice": [
3,
1,
0,
2
]
},
{
"features": [
-0.38225239768303104,
-0.05384615384615385,
0.0,
0.4260869565217391
],
"choice": [
3,
2,
0,
1
]
},
{
"features": [
-0.3590352293229513,
0.06153846153846154,
0.0,
-1.3239130434782607
],
"choice": [
2,
0,
1,
3
]
},
{
"features": [
0.3090399772101415,
0.6923076923076923,
0.0,
-0.003997789240972687
],
"choice": [
2,
0,
3,
1
]
},
{
"features": [
-0.3118649700883107,
-0.17692307692307693,
0.0,
0.4260869565217391
],
"choice": [
2,
0,
1,
3
]
},
{
"features": [
0.0,
32.83076923076923,
0.0,
0.4260869565217391
],
"choice": [
0,
3
]
},
{
"features": [
-0.3178473079479632,
-0.06153846153846154,
0.0,
0.4260869565217391
],
"choice": [
0,
3,
1,
2
]
}
],
"configsource": [
"fabert",
"bng_lowbwt",
"pol",
"Amazon_employee_access"
]
}

View File

@ -1,512 +0,0 @@
{
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "xgboost",
"hyperparameters": {
"n_estimators": 392,
"max_leaves": 46,
"min_child_weight": 0.20655273911443411,
"learning_rate": 0.08039123467849849,
"subsample": 0.6482821473906787,
"colsample_bylevel": 0.5448604029329934,
"colsample_bytree": 0.4211786481671673,
"reg_alpha": 0.029040644754759502,
"reg_lambda": 4.60220206538413
}
},
{
"class": "xgboost",
"hyperparameters": {
"n_estimators": 6357,
"max_leaves": 206,
"min_child_weight": 1.9495322566288034,
"learning_rate": 0.0068766724195393905,
"subsample": 0.9451618245005704,
"colsample_bylevel": 0.9030482524943064,
"colsample_bytree": 0.9278972006416252,
"reg_alpha": 0.01857648400903689,
"reg_lambda": 6.021166480604588,
"FLAML_sample_size": 344444
}
},
{
"class": "xgboost",
"hyperparameters": {
"n_estimators": 1067,
"max_leaves": 55,
"min_child_weight": 1.578700876556201,
"learning_rate": 0.01882776721912098,
"subsample": 0.6486829588043383,
"colsample_bylevel": 1.0,
"colsample_bytree": 0.6470978147570122,
"reg_alpha": 0.2623396481373557,
"reg_lambda": 12.320026567378322
}
},
{
"class": "xgboost",
"hyperparameters": {
"n_estimators": 765,
"max_leaves": 6,
"min_child_weight": 0.001,
"learning_rate": 1.0,
"subsample": 0.9833803894285497,
"colsample_bylevel": 1.0,
"colsample_bytree": 1.0,
"reg_alpha": 0.0012553728257619922,
"reg_lambda": 0.03280542610559108
}
},
{
"class": "xgboost",
"hyperparameters": {
"n_estimators": 2866,
"max_leaves": 2954,
"min_child_weight": 0.003652484923138387,
"learning_rate": 0.006320484540131336,
"subsample": 0.45886345839532916,
"colsample_bylevel": 0.4143419565729296,
"colsample_bytree": 0.9117641224108227,
"reg_alpha": 0.2873746517375349,
"reg_lambda": 17.04964039639045
}
},
{
"class": "xgboost",
"hyperparameters": {
"n_estimators": 512,
"max_leaves": 3194,
"min_child_weight": 0.004561511536080627,
"learning_rate": 0.05288849444758447,
"subsample": 0.8653058105000044,
"colsample_bylevel": 0.8833689901424637,
"colsample_bytree": 0.9505209943737727,
"reg_alpha": 0.0037017878164852017,
"reg_lambda": 2.1872397928745113,
"FLAML_sample_size": 470620
}
},
{
"class": "xgboost",
"hyperparameters": {
"n_estimators": 335,
"max_leaves": 37,
"min_child_weight": 0.0013851539632487603,
"learning_rate": 0.2593737370075479,
"subsample": 0.9810091528571387,
"colsample_bylevel": 0.9484250613084422,
"colsample_bytree": 0.192606132199437,
"reg_alpha": 0.10585986776049093,
"reg_lambda": 0.017684465384509407
}
},
{
"class": "xgboost",
"hyperparameters": {
"n_estimators": 8315,
"max_leaves": 4,
"min_child_weight": 0.7673654415794792,
"learning_rate": 0.002432260930606481,
"subsample": 0.8476000618302348,
"colsample_bylevel": 0.8815698870579244,
"colsample_bytree": 0.7057137578225323,
"reg_alpha": 0.0016838090603716895,
"reg_lambda": 0.28815989841009226
}
},
{
"class": "xgboost",
"hyperparameters": {
"n_estimators": 319,
"max_leaves": 1312,
"min_child_weight": 0.001,
"learning_rate": 0.01872379806270421,
"subsample": 0.6890079660561895,
"colsample_bylevel": 0.7551225121854014,
"colsample_bytree": 0.7860755604500558,
"reg_alpha": 0.17028752704343114,
"reg_lambda": 1.4375743264564231
}
},
{
"class": "xgboost",
"hyperparameters": {
"n_estimators": 5739,
"max_leaves": 5,
"min_child_weight": 0.1359602026207002,
"learning_rate": 0.14496176867613397,
"subsample": 0.864897070662231,
"colsample_bylevel": 0.01,
"colsample_bytree": 0.9394057513384305,
"reg_alpha": 0.001103317921178771,
"reg_lambda": 0.1655504349283218
}
},
{
"class": "xgboost",
"hyperparameters": {
"n_estimators": 3369,
"max_leaves": 23,
"min_child_weight": 0.006136645605168392,
"learning_rate": 0.05726537983358939,
"subsample": 1.0,
"colsample_bylevel": 1.0,
"colsample_bytree": 1.0,
"reg_alpha": 0.40981311572427176,
"reg_lambda": 4.343877111132155
}
}
],
"preprocessing": {
"center": [
24668.5,
54.0,
7.0,
1.0
],
"scale": [
57198.0,
770.5,
6.0,
1.0
]
},
"neighbors": [
{
"features": [
8.710820308402392,
0.0,
0.0,
-0.8148148148148149
],
"choice": [
5,
4,
1,
8,
10,
2,
0,
6,
9,
7,
3
]
},
{
"features": [
0.6701545508584216,
0.9474367293964958,
0.5,
0.0
],
"choice": [
0,
2,
3,
6,
10,
8,
9
]
},
{
"features": [
0.5945575020105598,
-0.03504218040233614,
15.5,
0.0
],
"choice": [
0,
2,
3,
7,
8,
5,
10,
9,
6
]
},
{
"features": [
0.8862285394594217,
0.0,
-0.5,
0.0
],
"choice": [
2,
8,
0,
4,
10,
1,
9,
6,
7,
5,
3
]
},
{
"features": [
-0.2739344033008147,
9.2744970798183,
0.5,
0.0
],
"choice": [
0,
3,
6
]
},
{
"features": [
0.48133676002657433,
-0.058403634003893576,
0.0,
0.0
],
"choice": [
10,
3,
0,
5,
1,
7,
6,
2,
4,
9,
8
]
},
{
"features": [
0.4862145529563971,
0.16353017521090202,
0.5,
0.0
],
"choice": [
1,
0,
2,
3,
10,
8,
6,
5,
9,
7
]
},
{
"features": [
-0.40409629707332423,
-0.06229720960415315,
-0.5,
-1.0
],
"choice": [
3,
9,
5,
10,
1,
7,
2,
8,
4,
6,
0
]
},
{
"features": [
-0.41428896115248787,
1.0408825438027256,
0.3333333333333333,
0.0
],
"choice": [
6,
9,
0,
5,
10,
4,
8,
7,
1,
2,
3
]
},
{
"features": [
0.6317091506696039,
-0.015574302401038288,
-0.6666666666666666,
-1.0
],
"choice": [
1,
10,
4,
5,
8,
6,
2,
0,
3,
9,
7
]
},
{
"features": [
-0.2739344033008147,
2.5256327060350423,
-0.3333333333333333,
0.0
],
"choice": [
0,
2,
3,
9,
6,
10,
5,
8,
7
]
},
{
"features": [
-0.30168012867582783,
0.9682024659312135,
0.0,
0.0
],
"choice": [
8,
4,
0,
2,
10,
1,
5,
6,
9,
7,
3
]
},
{
"features": [
0.2739344033008147,
-0.06229720960415315,
-0.6666666666666666,
0.0
],
"choice": [
10,
3,
9,
1,
4,
2,
8,
5,
0,
7,
6
]
},
{
"features": [
-0.39981293052204625,
0.21025308241401688,
0.5,
0.0
],
"choice": [
0,
9,
1,
7,
5,
10,
6,
2,
4,
8,
3
]
},
{
"features": [
-0.3949351375922235,
-0.04931862426995458,
0.0,
0.0
],
"choice": [
0,
2,
1,
7,
8,
4,
5,
6,
10,
9,
3
]
},
{
"features": [
-0.41797790132522117,
-0.04672290720311486,
-0.5,
0.0
],
"choice": [
7,
4,
8,
2,
0,
5,
10,
1,
6,
9,
3
]
}
],
"configsource": [
"segment",
"Albert",
"Helena",
"car",
"house_8L",
"Covertype",
"cnae-9",
"KDDCup09_appetency",
"fabert",
"dilbert",
"jungle_chess_2pcs_raw_endgame_complete"
]
}

View File

@ -1,311 +0,0 @@
{
"version": "1.0.2",
"meta_feature_names": [
"NumberOfInstances","NumberOfFeatures","NumberOfClasses","PercentageOfNumericFeatures"
],
"portfolio": [
{
"class": "xgboost",
"hyperparameters": {
"n_estimators": 6357,
"max_leaves": 206,
"min_child_weight": 1.9495322566288034,
"learning_rate": 0.0068766724195393905,
"subsample": 0.9451618245005704,
"colsample_bylevel": 0.9030482524943064,
"colsample_bytree": 0.9278972006416252,
"reg_alpha": 0.01857648400903689,
"reg_lambda": 6.021166480604588,
"FLAML_sample_size": 344444
}
},
{
"class": "xgboost",
"hyperparameters": {
"n_estimators": 23045,
"max_leaves": 247,
"min_child_weight": 0.004319397499079841,
"learning_rate": 0.0032914413473281215,
"subsample": 0.7334190564433234,
"colsample_bylevel": 1.0,
"colsample_bytree": 1.0,
"reg_alpha": 0.03514226467919635,
"reg_lambda": 1.2679661021665851
}
},
{
"class": "xgboost",
"hyperparameters": {
"n_estimators": 1899,
"max_leaves": 59,
"min_child_weight": 0.013389019900720164,
"learning_rate": 0.0028943401472847964,
"subsample": 0.7808944208233943,
"colsample_bylevel": 1.0,
"colsample_bytree": 0.9999355357362375,
"reg_alpha": 0.7905117773932884,
"reg_lambda": 2.916897119216104
}
},
{
"class": "xgboost",
"hyperparameters": {
"n_estimators": 5611,
"max_leaves": 61,
"min_child_weight": 0.01070518287797225,
"learning_rate": 0.005485127037677848,
"subsample": 0.4713518256961299,
"colsample_bylevel": 0.9777437906530106,
"colsample_bytree": 0.9519335125615331,
"reg_alpha": 0.03621564207188963,
"reg_lambda": 1.8045765669466283
}
}
],
"preprocessing": {
"center": [
36691.0,
10.0,
0.0,
1.0
],
"scale": [
324551.25,
2.5,
1.0,
0.36111111111111116
]
},
"neighbors": [
{
"features": [
0.0,
0.0,
0.0,
0.0
],
"choice": [
2,
3,
0,
1
]
},
{
"features": [
-0.07492191140844474,
12.0,
0.0,
0.0
],
"choice": [
0,
1,
3,
2
]
},
{
"features": [
2.6600082421497375,
-0.4,
0.0,
-0.923076923076923
],
"choice": [
3,
0,
2,
1
]
},
{
"features": [
0.21039820367353385,
-0.4,
0.0,
-2.4615384615384612
],
"choice": [
3,
2,
0,
1
]
},
{
"features": [
-0.06453526215043079,
-0.4,
0.0,
-0.923076923076923
],
"choice": [
2,
3,
0,
1
]
},
{
"features": [
-0.026800081651203008,
-0.4,
0.0,
-2.1538461538461537
],
"choice": [
2,
3,
0,
1
]
},
{
"features": [
2.6600082421497375,
3.2,
0.0,
-1.2307692307692306
],
"choice": [
1,
0,
3,
2
]
},
{
"features": [
2.6600082421497375,
0.0,
0.0,
-2.492307692307692
],
"choice": [
3,
0,
2,
1
]
},
{
"features": [
0.3781868040871819,
0.0,
0.0,
0.0
],
"choice": [
2,
3,
0,
1
]
},
{
"features": [
0.0,
0.0,
0.0,
0.0
],
"choice": [
3,
0,
1,
2
]
},
{
"features": [
-0.04987193856132121,
2.4,
0.0,
0.0
],
"choice": [
3,
1,
0,
2
]
},
{
"features": [
-0.04987193856132121,
-0.8,
0.0,
0.0
],
"choice": [
2,
0,
1,
3
]
},
{
"features": [
-0.0558155299047531,
-0.8,
0.0,
0.0
],
"choice": [
0,
3,
1,
2
]
},
{
"features": [
0.0,
0.0,
0.0,
-0.8307692307692308
],
"choice": [
1,
0,
3,
2
]
},
{
"features": [
2.729362465866331,
0.0,
0.0,
0.0
],
"choice": [
1,
0,
3,
2
]
},
{
"features": [
-0.07145558675247746,
15.2,
0.0,
0.0
],
"choice": [
0,
3,
1,
2
]
}
],
"configsource": [
"Albert",
"mv",
"bng_echomonths",
"house_16H"
]
}

View File

@ -1,9 +0,0 @@
import warnings
from flaml.automl.ml import *  # noqa: F401,F403 -- re-export kept for backward compatibility

# Deprecation shim: `flaml.ml` moved to `flaml.automl.ml`.
# stacklevel=2 attributes the warning to the importing module (the warnings
# machinery skips importlib frames), so users see which import to migrate.
warnings.warn(
    "Importing from `flaml.ml` is deprecated. Please use `flaml.automl.ml`.",
    DeprecationWarning,
    stacklevel=2,
)

View File

@ -1,9 +0,0 @@
import warnings
from flaml.automl.model import *  # noqa: F401,F403 -- re-export kept for backward compatibility

# Deprecation shim: `flaml.model` moved to `flaml.automl.model`.
# stacklevel=2 attributes the warning to the importing module (the warnings
# machinery skips importlib frames), so users see which import to migrate.
warnings.warn(
    "Importing from `flaml.model` is deprecated. Please use `flaml.automl.model`.",
    DeprecationWarning,
    stacklevel=2,
)

View File

@ -1,47 +0,0 @@
# ChaCha for Online AutoML
FLAML includes *ChaCha* which is an automatic hyperparameter tuning solution for online machine learning. Online machine learning has the following properties: (1) data comes in sequential order; and (2) the performance of the machine learning model is evaluated online, i.e., at every iteration. *ChaCha* performs online AutoML respecting the aforementioned properties of online learning, and at the same time respecting the following constraints: (1) only a small constant number of 'live' models are allowed to perform online learning at the same time; and (2) no model persistence or offline training is allowed, which means that once we decide to replace a 'live' model with a new one, the replaced model can no longer be retrieved.
For more technical details about *ChaCha*, please check our paper.
* [ChaCha for Online AutoML](https://www.microsoft.com/en-us/research/publication/chacha-for-online-automl/). Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. ICML 2021.
```
@inproceedings{wu2021chacha,
title={ChaCha for online AutoML},
author={Qingyun Wu and Chi Wang and John Langford and Paul Mineiro and Marco Rossi},
year={2021},
booktitle={ICML},
}
```
## `AutoVW`
`flaml.AutoVW` is a realization of the *ChaCha* AutoML method with online learners from the open-source online machine learning library [Vowpal Wabbit](https://vowpalwabbit.org/). It can be used to tune both conventional numerical and categorical hyperparameters, such as the learning rate, and hyperparameters for featurization choices, such as the namespace interactions (a namespace is a group of features) in Vowpal Wabbit.
An example of online namespace interactions tuning in VW:
```python
# require: pip install flaml[vw]
from flaml import AutoVW
'''create an AutoVW instance for tuning namespace interactions'''
autovw = AutoVW(max_live_model_num=5, search_space={'interactions': AutoVW.AUTOMATIC})
```
An example of online tuning of both namespace interactions and learning rate in VW:
```python
# require: pip install flaml[vw]
from flaml import AutoVW
from flaml.tune import loguniform
''' create an AutoVW instance for tuning namespace interactions and learning rate'''
# set up the search space and init config
search_space_nilr = {'interactions': AutoVW.AUTOMATIC, 'learning_rate': loguniform(lower=2e-10, upper=1.0)}
init_config_nilr = {'interactions': set(), 'learning_rate': 0.5}
# create an AutoVW instance
autovw = AutoVW(max_live_model_num=5, search_space=search_space_nilr, init_config=init_config_nilr)
```
A user can use the resulting AutoVW instances `autovw` in a similar way to a vanilla Vowpal Wabbit instance, i.e., `pyvw.vw`, to perform online learning by iteratively calling its `predict(data_example)` and `learn(data_example)` functions at each data example.
For more examples, please check out
[AutoVW notebook](https://github.com/microsoft/FLAML/blob/main/notebook/autovw.ipynb).

View File

@ -1,2 +0,0 @@
from .trial import VowpalWabbitTrial
from .trial_runner import OnlineTrialRunner

View File

@ -1,214 +0,0 @@
from typing import Optional, Union
import logging
from flaml.tune import (
Trial,
Categorical,
Float,
PolynomialExpansionSet,
polynomial_expansion_set,
)
from flaml.onlineml import OnlineTrialRunner
from flaml.tune.scheduler import ChaChaScheduler
from flaml.tune.searcher import ChampionFrontierSearcher
from flaml.onlineml.trial import get_ns_feature_dim_from_vw_example
logger = logging.getLogger(__name__)
class AutoVW:
    """Class for the AutoVW algorithm.

    Realizes the ChaCha online AutoML method on top of Vowpal Wabbit online
    learners: a pool of 'live' models is trained concurrently, and one of
    them is selected for prediction according to `model_select_policy`.
    """

    # Resource threshold below which the 'threshold_*' selection policies do
    # not trust a trial's score (see _select_best_trial).
    WARMSTART_NUM = 100
    # Sentinel value asking AutoVW to construct the search space for the
    # namespace-interactions hyperparameter automatically.
    AUTOMATIC = "_auto"
    # Name of the Vowpal Wabbit argument that holds namespace interactions.
    VW_INTERACTION_ARG_NAME = "interactions"

    def __init__(
        self,
        max_live_model_num: int,
        search_space: dict,
        init_config: Optional[dict] = {},
        min_resource_lease: Optional[Union[str, float]] = "auto",
        automl_runner_args: Optional[dict] = {},
        scheduler_args: Optional[dict] = {},
        model_select_policy: Optional[str] = "threshold_loss_ucb",
        metric: Optional[str] = "mae_clipped",
        random_seed: Optional[int] = None,
        model_selection_mode: Optional[str] = "min",
        cb_coef: Optional[float] = None,
    ):
        """Constructor.

        Args:
            max_live_model_num: An int to specify the maximum number of
                'live' models, which, in other words, is the maximum number
                of models allowed to update in each learning iteration.
            search_space: A dictionary of the search space. This search space
                includes both hyperparameters we want to tune and fixed
                hyperparameters. In the latter case, the value is a fixed value.
            init_config: A dictionary of a partial or full initial config,
                e.g. {'interactions': set(), 'learning_rate': 0.5}
            min_resource_lease: string or float | The minimum resource lease
                assigned to a particular model/trial. If set as 'auto', it will
                be calculated automatically.
            automl_runner_args: A dictionary of configuration for the OnlineTrialRunner.
                If set {}, default values will be used, which is equivalent to using
                the following configs.
                Example:

        ```python
        automl_runner_args = {
            "champion_test_policy": 'loss_ucb',  # the statistic test for a better champion
            "remove_worse": False,  # whether to do worse than test
        }
        ```

            scheduler_args: A dictionary of configuration for the scheduler.
                If set {}, default values will be used, which is equivalent to using the
                following config.
                Example:

        ```python
        scheduler_args = {
            "keep_challenger_metric": 'ucb',  # what metric to use when deciding the top performing challengers
            "keep_challenger_ratio": 0.5,  # denotes the ratio of top performing challengers to keep live
            "keep_champion": True,  # specifies whether to keep the champion always running
        }
        ```

            model_select_policy: A string in ['threshold_loss_ucb',
                'threshold_loss_lcb', 'threshold_loss_avg', 'loss_ucb', 'loss_lcb',
                'loss_avg'] to specify how to select one model to do prediction from
                the live model pool. Default value is 'threshold_loss_ucb'.
            metric: A string in ['mae_clipped', 'mae', 'mse', 'absolute_clipped',
                'absolute', 'squared'] to specify the name of the loss function used
                for calculating the progressive validation loss in ChaCha.
            random_seed: An integer of the random seed used in the searcher
                (more specifically, this is the random seed for the ConfigOracle).
            model_selection_mode: A string in ['min', 'max'] to specify the objective as
                minimization or maximization.
            cb_coef: A float coefficient (optional) used in the sample complexity bound.
        """
        # NOTE(review): the `{}` defaults are mutable and shared across calls;
        # they are only read/copied below, but confirm no caller mutates them.
        self._max_live_model_num = max_live_model_num
        self._search_space = search_space
        self._init_config = init_config
        # Arguments forwarded to each online trial created by the searcher.
        self._online_trial_args = {
            "metric": metric,
            "min_resource_lease": min_resource_lease,
            "cb_coef": cb_coef,
        }
        self._automl_runner_args = automl_runner_args
        self._scheduler_args = scheduler_args
        self._model_select_policy = model_select_policy
        self._model_selection_mode = model_selection_mode
        self._random_seed = random_seed
        # The trial runner is created lazily on the first predict() call,
        # because building the search space requires seeing one vw example.
        self._trial_runner = None
        self._best_trial = None
        # code for debugging purpose
        self._prediction_trial_id = None
        self._iter = 0

    def _setup_trial_runner(self, vw_example):
        """Set up the _trial_runner based on one vw_example."""
        # setup the default search space for the namespace interaction hyperparameter
        search_space = self._search_space.copy()
        for k, v in self._search_space.items():
            if k == self.VW_INTERACTION_ARG_NAME and v == self.AUTOMATIC:
                # Derive the raw namespaces from the example and search over
                # their polynomial expansions (namespace interactions).
                raw_namespaces = self.get_ns_feature_dim_from_vw_example(vw_example).keys()
                search_space[k] = polynomial_expansion_set(init_monomials=set(raw_namespaces))
        # setup the init config based on the input _init_config and search space
        init_config = self._init_config.copy()
        for k, v in search_space.items():
            if k not in init_config.keys():
                if isinstance(v, PolynomialExpansionSet):
                    init_config[k] = set()
                elif not isinstance(v, Categorical) and not isinstance(v, Float):
                    # Fixed (non-searchable) hyperparameter: copy the value over.
                    init_config[k] = v
        searcher_args = {
            "init_config": init_config,
            "space": search_space,
            "random_seed": self._random_seed,
            "online_trial_args": self._online_trial_args,
        }
        logger.info("original search_space %s", self._search_space)
        logger.info("original init_config %s", self._init_config)
        logger.info("searcher_args %s", searcher_args)
        logger.info("scheduler_args %s", self._scheduler_args)
        logger.info("automl_runner_args %s", self._automl_runner_args)
        searcher = ChampionFrontierSearcher(**searcher_args)
        scheduler = ChaChaScheduler(**self._scheduler_args)
        self._trial_runner = OnlineTrialRunner(
            max_live_model_num=self._max_live_model_num,
            searcher=searcher,
            scheduler=scheduler,
            **self._automl_runner_args
        )

    def predict(self, data_sample):
        """Predict on the input data sample.

        Args:
            data_sample: one data example in vw format.
        """
        if self._trial_runner is None:
            # Lazy initialization: the first example determines the namespaces.
            self._setup_trial_runner(data_sample)
        self._best_trial = self._select_best_trial()
        self._y_predict = self._best_trial.predict(data_sample)
        # code for debugging purpose
        if self._prediction_trial_id is None or self._prediction_trial_id != self._best_trial.trial_id:
            self._prediction_trial_id = self._best_trial.trial_id
            logger.info(
                "prediction trial id changed to %s at iter %s, resource used: %s",
                self._prediction_trial_id,
                self._iter,
                self._best_trial.result.resource_used,
            )
        return self._y_predict

    def learn(self, data_sample):
        """Perform one online learning step with the given data sample.

        Args:
            data_sample: one data example in vw format. It will be used to
                update the vw model.
        """
        self._iter += 1
        # Feed the sample, together with the prediction just made in
        # predict(), back to the runner so every live trial can learn.
        self._trial_runner.step(data_sample, (self._y_predict, self._best_trial))

    def _select_best_trial(self):
        """Select a best trial from the running trials according to the _model_select_policy."""
        best_score = float("+inf") if self._model_selection_mode == "min" else float("-inf")
        new_best_trial = None
        for trial in self._trial_runner.running_trials:
            # Threshold policies only trust trials that have consumed at
            # least WARMSTART_NUM resource units.
            if trial.result is not None and (
                "threshold" not in self._model_select_policy or trial.result.resource_used >= self.WARMSTART_NUM
            ):
                score = trial.result.get_score(self._model_select_policy)
                if ("min" == self._model_selection_mode and score < best_score) or (
                    "max" == self._model_selection_mode and score > best_score
                ):
                    best_score = score
                    new_best_trial = trial
        if new_best_trial is not None:
            logger.debug("best_trial resource used: %s", new_best_trial.result.resource_used)
            return new_best_trial
        else:
            # This branch will be triggered when the resource consumption of all trials is
            # smaller than the WARMSTART_NUM threshold. In this case, we will select the
            # _best_trial selected in the previous iteration.
            if self._best_trial is not None and self._best_trial.status == Trial.RUNNING:
                logger.debug("old best trial %s", self._best_trial.trial_id)
                return self._best_trial
            else:
                # this will be triggered in the first iteration or in the iteration where we want
                # to select the trial from the previous iteration but that trial has been paused
                # (i.e., self._best_trial.status != Trial.RUNNING) by the scheduler.
                logger.debug(
                    "using champion trial: %s",
                    self._trial_runner.champion_trial.trial_id,
                )
                return self._trial_runner.champion_trial

    @staticmethod
    def get_ns_feature_dim_from_vw_example(vw_example) -> dict:
        """Get a dictionary of feature dimensionality for each namespace singleton."""
        return get_ns_feature_dim_from_vw_example(vw_example)

View File

@ -1,415 +0,0 @@
import numpy as np
import logging
import time
import math
import copy
import collections
from typing import Optional, Union
from flaml.tune import Trial
try:
from sklearn.metrics import mean_squared_error, mean_absolute_error
except ImportError:
pass
logger = logging.getLogger(__name__)
def get_ns_feature_dim_from_vw_example(vw_example) -> dict:
    """Get a dictionary of feature dimensionality for each namespace singleton.

    The input ``vw_example`` is assumed to take one of the following formats,
    depending on whether the example includes feature names:

    - format 1: ``y |ns1 feature1:value1 feature2:value2 |ns2 feature3:value3 feature4:value4``
    - format 2: ``y |ns1 value1 value2 |ns2 value3 value4``

    The output for both examples is ``{'ns1': 2, 'ns2': 2}``.

    For more information about the input format of a vw example, please refer to
    https://github.com/VowpalWabbit/vowpal_wabbit/wiki/Input-format.

    Args:
        vw_example: A string in Vowpal Wabbit input format.

    Returns:
        A dict mapping each namespace name to the number of features it contains.
    """
    ns_feature_dim = {}
    # Each "|"-separated segment after the label is "<namespace> <feature> ...".
    for segment in vw_example.split("|")[1:]:
        # Drop empty tokens produced by leading/trailing/repeated spaces.
        # Fix: the original name:value branch counted the empty token left by
        # a trailing space (before the next "|") as an extra feature, which
        # contradicted the documented output {'ns1': 2, 'ns2': 2}.
        tokens = [token for token in segment.split(" ") if token]
        if not tokens:
            # Ignore degenerate empty segments (e.g. "y ||ns ...").
            continue
        ns_feature_dim[tokens[0]] = len(tokens) - 1
    logging.getLogger(__name__).debug("name space feature dimension %s", ns_feature_dim)
    return ns_feature_dim
class OnlineResult:
    """Container for the running loss statistics of an online trial.

    Tracks the progressive-validation loss average, a confidence bound on
    it, the total resource consumed, and a sliding window of recent losses.
    """

    # Probability allowance used inside the sample-complexity bound.
    prob_delta = 0.1
    # Clamp limits for the lower/upper confidence bounds on the loss.
    LOSS_MIN = 0.0
    LOSS_MAX = np.inf
    CB_COEF = 0.05  # 0.001 for mse

    def __init__(
        self,
        result_type_name: str,
        cb_coef: Optional[float] = None,
        init_loss: Optional[float] = 0.0,
        init_cb: Optional[float] = 100.0,
        mode: Optional[str] = "min",
        sliding_window_size: Optional[int] = 100,
    ):
        """Constructor.

        Args:
            result_type_name: A string naming the result type, e.g. 'mse' or 'mae'.
            cb_coef: Coefficient on the confidence bound; defaults to CB_COEF.
            init_loss: Loss value reported before any observation arrives.
            init_cb: Initial (deliberately large) confidence bound.
            mode: A string in ['min', 'max'] giving the objective direction.
            sliding_window_size: Size of the recent-loss window (experimental).
        """
        self._result_type_name = result_type_name  # e.g. 'mse' or 'mae'
        self._mode = mode
        self._init_loss = init_loss
        # Statistics required by the ChaCha algorithm.
        self.observation_count = 0
        self.resource_used = 0.0
        self._loss_avg = 0.0
        self._loss_cb = init_cb  # stays large until real observations shrink it
        self._cb_coef = self.CB_COEF if cb_coef is None else cb_coef
        # Optional statistics: sliding window over the most recent losses.
        self._sliding_window_size = sliding_window_size
        self._loss_queue = collections.deque(maxlen=self._sliding_window_size)

    def update_result(
        self,
        new_loss,
        new_resource_used,
        data_dimension,
        bound_of_range=1.0,
        new_observation_count=1.0,
    ):
        """Fold one new observation into the running statistics."""
        self.resource_used += new_resource_used
        # Maintain a running average (rather than a running sum) of the loss
        # so the accumulator cannot overflow.
        total = self.observation_count + new_observation_count
        self._loss_avg = self._loss_avg * (self.observation_count / total) + new_loss / total
        self.observation_count = total
        self._loss_cb = self._update_loss_cb(bound_of_range, data_dimension)
        self._loss_queue.append(new_loss)

    def _update_loss_cb(self, bound_of_range, data_dim, bound_name="sample_complexity_bound"):
        """Recompute the confidence bound on the loss average."""
        if bound_name != "sample_complexity_bound":
            raise NotImplementedError
        # mae-style losses use the configurable coefficient; everything else
        # (e.g. mse) uses a fixed, much smaller one.
        if "mae" in self.result_type_name:
            coef = self._cb_coef * bound_of_range
        else:
            coef = 0.001 * bound_of_range
        comp_F = math.sqrt(data_dim)
        n = self.observation_count
        return coef * comp_F * math.sqrt((np.log10(n / OnlineResult.prob_delta)) / n)

    @property
    def result_type_name(self):
        return self._result_type_name

    @property
    def loss_avg(self):
        # Fall back to the configured initial loss until something is observed.
        return self._init_loss if self.observation_count == 0 else self._loss_avg

    @property
    def loss_cb(self):
        return self._loss_cb

    @property
    def loss_lcb(self):
        """Lower confidence bound on the loss, clamped at LOSS_MIN."""
        return max(self._loss_avg - self._loss_cb, OnlineResult.LOSS_MIN)

    @property
    def loss_ucb(self):
        """Upper confidence bound on the loss, clamped at LOSS_MAX."""
        return min(self._loss_avg + self._loss_cb, OnlineResult.LOSS_MAX)

    @property
    def loss_avg_recent(self):
        """Average over the sliding window; the initial loss when empty."""
        if not self._loss_queue:
            return self._init_loss
        return sum(self._loss_queue) / len(self._loss_queue)

    def get_score(self, score_name, cb_ratio=1):
        """Return the statistic named by ``score_name`` ('lcb', 'ucb' or 'avg')."""
        if "lcb" in score_name:
            return max(self._loss_avg - cb_ratio * self._loss_cb, OnlineResult.LOSS_MIN)
        if "ucb" in score_name:
            return min(self._loss_avg + cb_ratio * self._loss_cb, OnlineResult.LOSS_MAX)
        if "avg" in score_name:
            return self._loss_avg
        raise NotImplementedError
class BaseOnlineTrial(Trial):
    """Base class for an online trial.

    Wraps one hyperparameter configuration together with its scheduling
    state: status, resource lease, and champion-related bookkeeping.
    """

    def __init__(
        self,
        config: dict,
        min_resource_lease: float,
        is_champion: Optional[bool] = False,
        is_checked_under_current_champion: Optional[bool] = True,
        custom_trial_name: Optional[str] = "mae",
        trial_id: Optional[str] = None,
    ):
        """Constructor.

        Args:
            config: The configuration dictionary.
            min_resource_lease: A float specifying the minimum resource lease.
            is_champion: A bool variable indicating whether the trial is champion.
            is_checked_under_current_champion: A bool indicating whether the trial
                has been used under the current champion.
            custom_trial_name: A string of a custom trial name.
            trial_id: A string for the trial id.
        """
        # Basic bookkeeping.
        self.config = config
        self.trial_id = trial_id
        self.status = Trial.PENDING
        self.start_time = time.time()
        self.custom_trial_name = custom_trial_name
        # Resource-budget state: the current lease starts at the minimum.
        self._min_resource_lease = min_resource_lease
        self._resource_lease = copy.copy(self._min_resource_lease)
        # Champion-related state; a freshly created trial is always considered
        # checked under the champion that existed when it was created.
        self._is_champion = is_champion
        self._is_checked_under_current_champion = is_checked_under_current_champion

    @property
    def is_champion(self):
        """Whether this trial is currently the champion."""
        return self._is_champion

    @property
    def is_checked_under_current_champion(self):
        """Whether this trial has been run under the current champion."""
        return self._is_checked_under_current_champion

    @property
    def resource_lease(self):
        """The trial's current resource lease."""
        return self._resource_lease

    def set_checked_under_current_champion(self, checked_under_current_champion: bool):
        # When a new champion is promoted we want to pause running trials early
        # (before they reach their next scheduling checkpoint), because the new
        # champion's freshly generated challengers are likely to contain a
        # better trial. Tracking this flag lets the scheduler free a running
        # slot as soon as possible instead of spending the remaining lease on
        # trials vetted only under the previous champion.
        self._is_checked_under_current_champion = checked_under_current_champion

    def set_resource_lease(self, resource: float):
        """Sets the resource lease accordingly."""
        self._resource_lease = resource

    def set_status(self, status):
        """Sets the status of the trial and records the start time if needed."""
        self.status = status
        # Stamp the start time on the first transition into RUNNING.
        if status == Trial.RUNNING and self.start_time is None:
            self.start_time = time.time()
class VowpalWabbitTrial(BaseOnlineTrial):
    """The class for Vowpal Wabbit online trials."""

    # NOTE: 1. About namespaces in vw:
    # - Wiki in vw:
    # https://github.com/VowpalWabbit/vowpal_wabbit/wiki/Namespaces
    # - Namespace vs features:
    # https://stackoverflow.com/questions/28586225/in-vowpal-wabbit-what-is-the-difference-between-a-namespace-and-feature
    # About result:
    # 1. training related results (need to be updated in the trainable class)
    # 2. result about resources lease (need to be updated externally)

    # Resource cost charged per processed data sample.
    cost_unit = 1.0
    # Config key under which the namespace interactions are stored.
    interactions_config_key = "interactions"
    # Multiplier used to derive an automatic resource lease from the
    # model's feature dimension (see train_eval_model_online).
    MIN_RES_CONST = 5

    def __init__(
        self,
        config: dict,
        min_resource_lease: float,
        metric: str = "mae",
        is_champion: Optional[bool] = False,
        is_checked_under_current_champion: Optional[bool] = True,
        custom_trial_name: Optional[str] = "vw_mae_clipped",
        trial_id: Optional[str] = None,
        cb_coef: Optional[float] = None,
    ):
        """Constructor.

        Args:
            config (dict): the config of the trial (note that the config is a set
                because the hyperparameters are).
            min_resource_lease (float): the minimum resource lease.
            metric (str): the loss metric.
            is_champion (bool): indicates whether the trial is the current champion or not.
            is_checked_under_current_champion (bool): indicates whether this trials has
                been paused under the current champion.
            custom_trial_name (str): a custom name for the trial.
            trial_id (str): id of the trial (if None, it will be generated in the constructor).
            cb_coef (float): an optional coefficient used in the sample complexity bound.
        """
        try:
            from vowpalwabbit import pyvw
        except ImportError:
            raise ImportError("To use AutoVW, please run pip install flaml[vw] to install vowpalwabbit")
        # attributes
        self.trial_id = self._config_to_id(config) if trial_id is None else trial_id
        logger.info("Create trial with trial_id: %s", self.trial_id)
        super().__init__(
            config,
            min_resource_lease,
            is_champion,
            is_checked_under_current_champion,
            custom_trial_name,
            self.trial_id,
        )
        self.model = None  # model is None until the config is scheduled to run
        self.result = None
        self.trainable_class = pyvw.vw
        # variables that are needed during online training
        self._metric = metric
        self._y_min_observed = None
        self._y_max_observed = None
        # application dependent variables
        self._dim = None
        self._cb_coef = cb_coef

    @staticmethod
    def _config_to_id(config):
        """Generate an id for the provided config."""
        # sort config keys so the id is deterministic regardless of dict order
        sorted_k_list = sorted(list(config.keys()))
        config_id_full = ""
        for key in sorted_k_list:
            v = config[key]
            config_id = "|"
            if isinstance(v, set):
                # sets are unordered: sort the members for a stable id
                value_list = sorted(v)
                config_id += "_".join([str(k) for k in value_list])
            else:
                config_id += str(v)
            config_id_full = config_id_full + config_id
        return config_id_full

    def _initialize_vw_model(self, vw_example):
        """Initialize a vw model using the trainable_class"""
        self._vw_config = self.config.copy()
        ns_interactions = self.config.get(VowpalWabbitTrial.interactions_config_key, None)
        # ensure the feature interaction config is a list (required by VW)
        if ns_interactions is not None:
            self._vw_config[VowpalWabbitTrial.interactions_config_key] = list(ns_interactions)
        # get the dimensionality of the feature according to the namespace configuration
        namespace_feature_dim = get_ns_feature_dim_from_vw_example(vw_example)
        self._dim = self._get_dim_from_ns(namespace_feature_dim, ns_interactions)
        # construct an instance of vw model using the input config and fixed config
        self.model = self.trainable_class(**self._vw_config)
        self.result = OnlineResult(
            self._metric,
            cb_coef=self._cb_coef,
            init_loss=0.0,
            init_cb=100.0,
        )

    def train_eval_model_online(self, data_sample, y_pred):
        """Train and evaluate model online."""
        # extract info needed the first time we see the data
        if self._resource_lease == "auto" or self._resource_lease is None:
            # NOTE(review): relies on predict()/_initialize_vw_model having
            # set self._dim before learning starts -- confirm call order.
            assert self._dim is not None
            # automatic lease: proportional to the model's feature dimension
            self._resource_lease = self._dim * self.MIN_RES_CONST
        y = self._get_y_from_vw_example(data_sample)
        self._update_y_range(y)
        if self.model is None:
            # initialize self.model and self.result
            self._initialize_vw_model(data_sample)
        # do one step of learning
        self.model.learn(data_sample)
        # update training related results accordingly
        new_loss = self._get_loss(y, y_pred, self._metric, self._y_min_observed, self._y_max_observed)
        # update sample size, sum of loss, and cost
        data_sample_size = 1
        bound_of_range = self._y_max_observed - self._y_min_observed
        if bound_of_range == 0:
            bound_of_range = 1.0
        self.result.update_result(
            new_loss,
            VowpalWabbitTrial.cost_unit * data_sample_size,
            self._dim,
            bound_of_range,
        )

    def predict(self, x):
        """Predict using the model."""
        if self.model is None:
            # initialize self.model and self.result
            self._initialize_vw_model(x)
        return self.model.predict(x)

    def _get_loss(self, y_true, y_pred, loss_func_name, y_min_observed, y_max_observed):
        """Get instantaneous loss from y_true and y_pred, and loss_func_name
        For mae_clip, we clip y_pred in the observed range of y
        """
        if "mse" in loss_func_name or "squared" in loss_func_name:
            loss_func = mean_squared_error
        elif "mae" in loss_func_name or "absolute" in loss_func_name:
            loss_func = mean_absolute_error
            if y_min_observed is not None and y_max_observed is not None and "clip" in loss_func_name:
                # clip y_pred in the observed range of y
                y_pred = min(y_max_observed, max(y_pred, y_min_observed))
        else:
            raise NotImplementedError
        return loss_func([y_true], [y_pred])

    def _update_y_range(self, y):
        """Maintain running observed minimum and maximum target value."""
        if self._y_min_observed is None or y < self._y_min_observed:
            self._y_min_observed = y
        if self._y_max_observed is None or y > self._y_max_observed:
            self._y_max_observed = y

    @staticmethod
    def _get_dim_from_ns(namespace_feature_dim: dict, namespace_interactions: Union[set, list]):
        """Get the dimensionality of the corresponding feature of input namespace set."""
        total_dim = sum(namespace_feature_dim.values())
        if namespace_interactions:
            # each interaction contributes the product of its namespaces' dims
            for f in namespace_interactions:
                ns_dim = 1.0
                for c in f:
                    ns_dim *= namespace_feature_dim[c]
                total_dim += ns_dim
        return total_dim

    def clean_up_model(self):
        # Release the model and its statistics (no model persistence is
        # allowed in the online setting).
        self.model = None
        self.result = None

    @staticmethod
    def _get_y_from_vw_example(vw_example):
        """Get y from a vw_example. this works for regression datasets."""
        return float(vw_example.split("|")[0])

View File

@ -1,534 +0,0 @@
import numpy as np
import math
from flaml.tune import Trial
from flaml.tune.scheduler import TrialScheduler
import logging
logger = logging.getLogger(__name__)
class OnlineTrialRunner:
"""Class for the OnlineTrialRunner."""
# ************NOTE about the status of a trial***************
# Trial.PENDING: All trials are set to be pending when frist added into the OnlineTrialRunner until
# it is selected to run. By this definition, a trial with status Trial.PENDING is a challenger
# trial added to the OnlineTrialRunner but never been selected to run.
# It denotes the starting of trial's lifespan in the OnlineTrialRunner.
# Trial.RUNNING: It indicates that this trial is one of the concurrently running trials.
# The max number of Trial.RUNNING trials is running_budget.
# The status of a trial will be set to Trial.RUNNING the next time it selected to run.
# A trial's status may have the following change:
# Trial.PENDING -> Trial.RUNNING
# Trial.PAUSED - > Trial.RUNNING
# Trial.PAUSED: The status of a trial is set to Trial.PAUSED once it is removed from the running trials.
# Trial.RUNNING - > Trial.PAUSED
# Trial.TERMINATED: set the status of a trial to Trial.TERMINATED when you never want to select it.
# It denotes the real end of a trial's lifespan.
# Status change routine of a trial:
# Trial.PENDING -> (Trial.RUNNING -> Trial.PAUSED -> Trial.RUNNING -> ...) -> Trial.TERMINATED(optional)
RANDOM_SEED = 123456
WARMSTART_NUM = 100
def __init__(
self, max_live_model_num: int, searcher=None, scheduler=None, champion_test_policy="loss_ucb", **kwargs
):
"""Constructor.
Args:
max_live_model_num: The maximum number of 'live'/running models allowed.
searcher: A class for generating Trial objects progressively.
The ConfigOracle is implemented in the searcher.
scheduler: A class for managing the 'live' trials and allocating the
resources for the trials.
champion_test_policy: A string to specify what test policy to test for
champion. Currently can choose from ['loss_ucb', 'loss_avg', 'loss_lcb', None].
"""
# ************A NOTE about the input searcher and scheduler******
# Required methods of the searcher:
# - next_trial()
# Generate the next trial to add.
# - set_search_properties(metric: Optional[str], mode: Optional[str],
# config: Optional[dict], setting: Optional[dict])
# Generate new challengers based on the current champion and update the challenger list
# - on_trial_result(trial_id: str, result: Dict)
# Reprot results to the scheduler.
# Required methods of the scheduler:
# - on_trial_add(trial_runner, trial: Trial)
# It adds candidate trials to the scheduler. It is called inside of the add_trial
# function in the TrialRunner.
# - on_trial_remove(trial_runner, trial: Trial)
# Remove terminated trials from the scheduler.
# - on_trial_result(trial_runner, trial: Trial, result: Dict)
# Reprot results to the scheduler.
# - choose_trial_to_run(trial_runner) -> Optional[Trial]
# Among them, on_trial_result and choose_trial_to_run are the most important methods
# *****************************************************************
# OnlineTrialRunner setting
self._searcher = searcher
self._scheduler = scheduler
self._champion_test_policy = champion_test_policy
self._max_live_model_num = max_live_model_num
self._remove_worse = kwargs.get("remove_worse", True)
self._bound_trial_num = kwargs.get("bound_trial_num", False)
self._no_model_persistence = True
# stores all the trials added to the OnlineTrialRunner
# i.e., include the champion and all the challengers
self._trials = []
self._champion_trial = None
self._best_challenger_trial = None
self._first_challenger_pool_size = None
self._random_state = np.random.RandomState(self.RANDOM_SEED)
self._running_trials = set()
# initially schedule up to max_live_model_num of live models and
# set the first trial as the champion (which is done inside self.step())
self._total_steps = 0
logger.info("init step %s", self._max_live_model_num)
# TODO: add more comments
self.step()
assert self._champion_trial is not None
@property
def champion_trial(self) -> Trial:
    """The trial currently holding the champion role.

    Initially set by the first trial produced by the searcher; later
    replaced whenever a challenger passes the better-than-champion test.
    """
    return self._champion_trial
@property
def running_trials(self):
    """The set of trials currently in running ('live') status.

    Returns:
        The internal set of trials whose status is Trial.RUNNING.
    """
    return self._running_trials
def step(self, data_sample=None, prediction_trial_tuple=None):
    """Schedule one trial to run each time it is called.

    Args:
        data_sample: One data example.
        prediction_trial_tuple: A list of information containing
            (prediction_made, prediction_trial).
    """
    # TODO: Will remove prediction_trial_tuple.
    # NOTE: This function consists of the following several parts:
    # * Update model:
    # 0. Update running trials using observations received.
    # * Tests for Champion:
    # 1. Test for champion (BetterThan test, and WorseThan test)
    # 1.1 BetterThan test
    # 1.2 WorseThan test: a trial may be removed if WorseThan test is triggered
    # * Online Scheduling:
    # 2. Report results to the searcher and scheduler (the scheduler will return a decision about
    # the status of the running trials).
    # 3. Pause or stop a trial according to the scheduler's decision.
    # Add a trial into the OnlineTrialRunner if there are opening slots.

    # ***********Update running trials with observation*******************
    if data_sample is not None:
        self._total_steps += 1
        prediction_made, prediction_trial = (
            prediction_trial_tuple[0],
            prediction_trial_tuple[1],
        )
        # assert prediction_trial.status == Trial.RUNNING
        trials_to_pause = []
        # Iterate over a snapshot: stop_trial/run_trial below mutate
        # self._running_trials while we are looping.
        for trial in list(self._running_trials):
            if trial != prediction_trial:
                y_predicted = trial.predict(data_sample)
            else:
                # Reuse the prediction the caller already made for this trial.
                y_predicted = prediction_made
            trial.train_eval_model_online(data_sample, y_predicted)
            logger.debug(
                "running trial at iter %s %s %s %s %s %s",
                self._total_steps,
                trial.trial_id,
                trial.result.loss_avg,
                trial.result.loss_cb,
                trial.result.resource_used,
                trial.resource_lease,
            )
            # report result to the searcher
            self._searcher.on_trial_result(trial.trial_id, trial.result)
            # report result to the scheduler and the scheduler makes a decision about
            # the running status of the trial
            decision = self._scheduler.on_trial_result(self, trial, trial.result)
            # set the status of the trial according to the decision made by the scheduler
            logger.debug(
                "trial decision %s %s at step %s",
                decision,
                trial.trial_id,
                self._total_steps,
            )
            if decision == TrialScheduler.STOP:
                self.stop_trial(trial)
            elif decision == TrialScheduler.PAUSE:
                # Pausing is deferred until after the champion tests below.
                trials_to_pause.append(trial)
            else:
                self.run_trial(trial)
        # ***********Statistical test of champion*************************************
        self._champion_test()
        # Pause the trial after the tests because the tests involves the reset of the trial's result
        for trial in trials_to_pause:
            self.pause_trial(trial)
    # ***********Add and schedule new trials to run if there are opening slots****
    # Add trial if needed: add challengers into consideration through _add_trial_from_searcher()
    # if there are available slots
    for _ in range(self._max_live_model_num - len(self._running_trials)):
        self._add_trial_from_searcher()
    # Scheduling: schedule up to max_live_model_num number of trials to run
    # (set the status as Trial.RUNNING)
    while self._max_live_model_num > len(self._running_trials):
        trial_to_run = self._scheduler.choose_trial_to_run(self)
        if trial_to_run is not None:
            self.run_trial(trial_to_run)
        else:
            # The scheduler has no eligible candidate; stop filling slots.
            break
def get_top_running_trials(self, top_ratio=None, top_metric="ucb") -> list:
    """Get the top-performing running trials.

    Args:
        top_ratio: Fraction (float) of the valid running trials to return,
            a string containing 'best' to return only the single best trial,
            or None to return an empty selection.
        top_metric: Which score to rank trials by; a string containing
            'ucb', 'avg', or 'lcb' selects loss_ucb, loss_avg, or loss_lcb
            respectively.

    Returns:
        A list of Trial objects (not trial ids), ordered from smallest to
        largest score, containing at most the selected number of trials.
        An empty list when no running trial has reported a result yet.

    Raises:
        NotImplementedError: if top_ratio or top_metric has an unsupported form.
    """
    running_valid_trials = [trial for trial in self._running_trials if trial.result is not None]
    if not running_valid_trials:
        # Bug fix: this previously returned None (bare `return`), contradicting
        # the declared `-> list` return type; an empty list keeps callers that
        # iterate or take len() working uniformly.
        return []
    if top_ratio is None:
        top_number = 0
    elif isinstance(top_ratio, float):
        top_number = math.ceil(len(running_valid_trials) * top_ratio)
    elif isinstance(top_ratio, str) and "best" in top_ratio:
        top_number = 1
    else:
        raise NotImplementedError
    if "ucb" in top_metric:
        test_attribute = "loss_ucb"
    elif "avg" in top_metric:
        test_attribute = "loss_avg"
    elif "lcb" in top_metric:
        test_attribute = "loss_lcb"
    else:
        raise NotImplementedError
    top_running_valid_trials = []
    logger.info("Running trial ids %s", [trial.trial_id for trial in running_valid_trials])
    # Shuffle before ranking so that score ties are broken at random.
    self._random_state.shuffle(running_valid_trials)
    results = [trial.result.get_score(test_attribute) for trial in running_valid_trials]
    # sorted result (small to large) index
    sorted_index = np.argsort(np.array(results))
    for i in range(min(top_number, len(running_valid_trials))):
        top_running_valid_trials.append(running_valid_trials[sorted_index[i]])
    logger.info("Top running ids %s", [trial.trial_id for trial in top_running_valid_trials])
    return top_running_valid_trials
def _add_trial_from_searcher(self):
    """Add a new trial to this TrialRunner.

    NOTE:
        The new trial is acquired from the input search algorithm, i.e. self._searcher.
        A 'new' trial means the trial is not in self._trial.
    """
    # (optionally) upper bound the number of trials in the OnlineTrialRunner
    if self._bound_trial_num and self._first_challenger_pool_size is not None:
        active_trial_size = len([t for t in self._trials if t.status != Trial.TERMINATED])
        # Bound grows logarithmically with the number of processed samples:
        # (log10(total_steps) + 1) * size of the first challenger pool.
        # NOTE(review): the `if self._first_challenger_pool_size` ternary guards
        # against a pool size of 0 (falls back to np.inf); the outer check
        # already excludes None.
        trial_num_upper_bound = (
            int(round((np.log10(self._total_steps) + 1) * self._first_challenger_pool_size))
            if self._first_challenger_pool_size
            else np.inf
        )
        if active_trial_size > trial_num_upper_bound:
            logger.info(
                "Not adding new trials: %s exceeds trial limit %s.",
                active_trial_size,
                trial_num_upper_bound,
            )
            return None
    # output one trial from the trial pool (new challenger pool) maintained in the searcher
    # Assumption on the searcher: when all frontiers (i.e., all the challengers generated
    # based on the current champion) of the current champion are added, calling next_trial()
    # will return None
    trial = self._searcher.next_trial()
    if trial is not None:
        self.add_trial(trial)  # dup checked in add_trial
        # the champion_trial is initially None, so we need to set it up the first time
        # a valid trial is added.
        # Assumption on self._searcher: the first trial generated is the champion trial
        if self._champion_trial is None:
            logger.info("Initial set up of the champion trial %s", trial.config)
            self._set_champion(trial)
    else:
        # The searcher has exhausted its challenger pool for the current champion.
        self._all_new_challengers_added = True
    if self._first_challenger_pool_size is None:
        # Record the size of the first challenger pool; it is used by the
        # trial-number upper bound above.
        self._first_challenger_pool_size = len(self._trials)
def _champion_test(self):
    """Perform tests against the latest champion, including better_than tests and worse_than tests."""
    # for BetterThan test, we only need to compare the best challenger with the champion
    self._get_best_challenger()
    if self._best_challenger_trial is not None:
        assert self._best_challenger_trial.trial_id != self._champion_trial.trial_id
        # test whether a new champion is found and set the trial properties accordingly
        is_new_champion_found = self._better_than_champion_test(self._best_challenger_trial)
        if is_new_champion_found:
            self._set_champion(new_champion_trial=self._best_challenger_trial)
    # performs _worse_than_champion_test, which is an optional component in ChaCha
    if self._remove_worse:
        to_stop = []
        for trial_to_test in self._trials:
            if trial_to_test.status != Trial.TERMINATED:
                worse_than_champion = self._worse_than_champion_test(
                    self._champion_trial, trial_to_test, self.WARMSTART_NUM
                )
                if worse_than_champion:
                    to_stop.append(trial_to_test)
        # we want to ensure there are at least #max_live_model_num of challengers remaining
        max_to_stop_num = len([t for t in self._trials if t.status != Trial.TERMINATED]) - self._max_live_model_num
        # Only stop as many trials as the live-model budget allows; any excess
        # candidates in to_stop survive until a later call.
        for i in range(min(max_to_stop_num, len(to_stop))):
            self.stop_trial(to_stop[i])
def _get_best_challenger(self):
    """Refresh self._best_challenger_trial with the best challenger
    (per self._champion_test_policy) among active, non-champion trials.
    """
    policy = self._champion_test_policy
    if policy is None:
        return
    if "ucb" in policy:
        score_name = "loss_ucb"
    elif "avg" in policy:
        score_name = "loss_avg"
    else:
        raise NotImplementedError
    # Collect non-terminated, non-champion trials that have reported a result.
    candidates = []
    for trial in self._trials:
        if trial.status == Trial.TERMINATED:
            continue
        if trial.trial_id == self._champion_trial.trial_id:
            continue
        if trial.result is None:
            continue
        candidates.append(trial)
    if not candidates:
        return
    # Shuffle first so that argmin breaks score ties at random.
    self._random_state.shuffle(candidates)
    scores = [t.result.get_score(score_name) for t in candidates]
    self._best_challenger_trial = candidates[int(np.argmin(scores))]
def _set_champion(self, new_champion_trial):
    """Promote new_champion_trial to champion and update the status flags
    of the existing trials accordingly.
    """
    assert new_champion_trial is not None
    first_champion = self._champion_trial is None
    self.run_trial(new_champion_trial)
    # Under the new champion, only the champion itself starts out as checked.
    for trial in self._trials:
        trial.set_checked_under_current_champion(trial.trial_id == new_champion_trial.trial_id)
    self._champion_trial = new_champion_trial
    self._all_new_challengers_added = False
    logger.info("Set the champion as %s", self._champion_trial.trial_id)
    if first_champion:
        self._champion_update_times = 0
    else:
        self._champion_update_times += 1
        # calling set_search_properties of searcher will trigger
        # new challenger generation. we do not do this for init champion
        # as this step is already done when first constructing the searcher
        self._searcher.set_search_properties(setting={self._searcher.CHAMPION_TRIAL_NAME: self._champion_trial})
def get_trials(self) -> list:
    """Return the list of all trials managed by this TrialRunner
    (champion and challengers, including terminated ones).
    """
    return self._trials
def add_trial(self, new_trial):
    """Add a new trial to this TrialRunner.

    Trials may be added at any time. A trial whose trial_id (the signature
    of the trial) already exists is not added again.

    Args:
        new_trial (Trial): Trial to queue.
    """
    # Duplicate config resurfacing under the current champion: just mark the
    # existing trial as re-checked rather than queuing it a second time.
    existing = next((t for t in self._trials if t.trial_id == new_trial.trial_id), None)
    if existing is not None:
        existing.set_checked_under_current_champion(True)
        return
    logger.info(
        "adding trial at iter %s, %s %s",
        self._total_steps,
        new_trial.trial_id,
        len(self._trials),
    )
    self._trials.append(new_trial)
    self._scheduler.on_trial_add(self, new_trial)
def stop_trial(self, trial):
    """Terminate a trial: set its status to Trial.TERMINATED, release its
    model, and notify the scheduler and searcher.

    No-op for trials that are already in an error or terminated state.
    """
    if trial.status in (Trial.ERROR, Trial.TERMINATED):
        return
    logger.info(
        "Terminating trial %s, with trial result %s",
        trial.trial_id,
        trial.result,
    )
    trial.set_status(Trial.TERMINATED)
    # A terminated trial never resumes, so its model can always be discarded.
    trial.clean_up_model()
    self._scheduler.on_trial_remove(self, trial)
    self._searcher.on_trial_complete(trial.trial_id)
    # NOTE(review): assumes the trial is in the running set — TODO confirm
    # callers never stop a paused trial (set.remove raises KeyError otherwise).
    self._running_trials.remove(trial)
def pause_trial(self, trial):
    """Pause a trial: set the status of a trial to be Trial.PAUSED
    and perform other subsequent operations.

    No-op for trials already in error or terminated state.
    """
    if trial.status in [Trial.ERROR, Trial.TERMINATED]:
        return
    else:
        # NOTE: the backslash inside the literal is a line continuation
        # within the string itself; the emitted message includes the
        # resulting whitespace.
        logger.info(
            "Pausing trial %s, with trial loss_avg: %s, loss_cb: %s, loss_ucb: %s,\
            resource_lease: %s",
            trial.trial_id,
            trial.result.loss_avg,
            trial.result.loss_cb,
            trial.result.loss_avg + trial.result.loss_cb,
            trial.resource_lease,
        )
        trial.set_status(Trial.PAUSED)
        # clean up model and result if no model persistence
        if self._no_model_persistence:
            trial.clean_up_model()
        self._running_trials.remove(trial)
def run_trial(self, trial):
    """Mark a trial as live: set its status to Trial.RUNNING and add it to
    the running set.

    No-op for trials already in an error or terminated state.
    """
    if trial.status in (Trial.ERROR, Trial.TERMINATED):
        return
    trial.set_status(Trial.RUNNING)
    self._running_trials.add(trial)
def _better_than_champion_test(self, trial_to_test):
    """Test whether there is a config in the existing trials that
    is better than the current champion config.

    Returns:
        A bool indicating whether a new champion is found.
    """
    # Both sides need observed results before any statistical comparison.
    if trial_to_test.result is None or self._champion_trial.result is None:
        return False
    policy = self._champion_test_policy
    if "ucb" in policy:
        return self._test_lcb_ucb(self._champion_trial, trial_to_test, self.WARMSTART_NUM)
    if "avg" in policy:
        return self._test_avg_loss(self._champion_trial, trial_to_test, self.WARMSTART_NUM)
    if "martingale" in policy:
        return self._test_martingale(self._champion_trial, trial_to_test)
    raise NotImplementedError
@staticmethod
def _worse_than_champion_test(champion_trial, trial, warmstart_num=1) -> bool:
    """Test whether the input trial is worse than the champion_trial.

    A trial is deemed worse when its loss lower confidence bound exceeds the
    champion's loss upper confidence bound, i.e. the confidence intervals no
    longer overlap in the trial's favor. The test is only performed once the
    trial has used at least warmstart_num resources.

    Args:
        champion_trial: The current champion trial.
        trial: The trial to test against the champion.
        warmstart_num: Minimum resource usage before the test applies.

    Returns:
        True if the trial is statistically worse than the champion.
    """
    # NOTE(review): champion_trial.result is accessed without a None check —
    # assumes the champion always has a result by the time this runs; confirm
    # against callers.
    if trial.result is not None and trial.result.resource_used >= warmstart_num:
        if trial.result.loss_lcb > champion_trial.result.loss_ucb:
            logger.info(
                "=========trial %s is worse than champion %s=====",
                trial.trial_id,
                champion_trial.trial_id,
            )
            logger.info("trial %s %s %s", trial.config, trial.result, trial.resource_lease)
            logger.info(
                "trial loss_avg:%s, trial loss_cb %s",
                trial.result.loss_avg,
                trial.result.loss_cb,
            )
            logger.info(
                "champion loss_avg:%s, champion loss_cb %s",
                champion_trial.result.loss_avg,
                champion_trial.result.loss_cb,
            )
            logger.info("champion %s", champion_trial.config)
            logger.info(
                "trial loss_avg_recent:%s, trial loss_cb %s",
                trial.result.loss_avg_recent,
                trial.result.loss_cb,
            )
            logger.info(
                "champion loss_avg_recent:%s, champion loss_cb %s",
                champion_trial.result.loss_avg_recent,
                champion_trial.result.loss_cb,
            )
            return True
    return False
@staticmethod
def _test_lcb_ucb(champion_trial, trial, warmstart_num=1) -> bool:
    """Compare the challenger's (i.e., trial's) loss upper bound with the
    champion_trial's loss lower bound minus cb.

    The challenger wins only if its loss_ucb is below the champion's
    loss_lcb by an extra margin of one loss_cb, and only once the challenger
    has used at least warmstart_num resources.

    Returns:
        True if a new champion condition is satisfied.
    """
    assert trial.trial_id != champion_trial.trial_id
    if trial.result.resource_used >= warmstart_num:
        # Margin: champion's lcb minus one more cb, a deliberately
        # conservative threshold before replacing the champion.
        if trial.result.loss_ucb < champion_trial.result.loss_lcb - champion_trial.result.loss_cb:
            logger.info("======new champion condition satisfied: using lcb vs ucb=====")
            logger.info(
                "new champion trial %s %s %s",
                trial.trial_id,
                trial.result.resource_used,
                trial.resource_lease,
            )
            logger.info(
                "new champion trial loss_avg:%s, trial loss_cb %s",
                trial.result.loss_avg,
                trial.result.loss_cb,
            )
            logger.info(
                "old champion trial %s %s %s",
                champion_trial.trial_id,
                champion_trial.result.resource_used,
                champion_trial.resource_lease,
            )
            logger.info(
                "old champion loss avg %s, loss cb %s",
                champion_trial.result.loss_avg,
                champion_trial.result.loss_cb,
            )
            return True
    return False
@staticmethod
def _test_avg_loss(champion_trial, trial, warmstart_num=1) -> bool:
    """Compare the challenger's (i.e., trial's) average loss with the
    champion_trial's average loss.

    The challenger wins on a strictly smaller loss_avg, once it has used
    at least warmstart_num resources.

    Returns:
        True if a new champion condition is satisfied.
    """
    assert trial.trial_id != champion_trial.trial_id
    if trial.result.resource_used >= warmstart_num:
        if trial.result.loss_avg < champion_trial.result.loss_avg:
            logger.info("=====new champion condition satisfied using avg loss=====")
            logger.info("trial %s", trial.config)
            logger.info(
                "trial loss_avg:%s, trial loss_cb %s",
                trial.result.loss_avg,
                trial.result.loss_cb,
            )
            logger.info(
                "champion loss_avg:%s, champion loss_cb %s",
                champion_trial.result.loss_avg,
                champion_trial.result.loss_cb,
            )
            logger.info("champion %s", champion_trial.config)
            return True
    return False
@staticmethod
def _test_martingale(champion_trial, trial):
"""Comare the challenger and champion using confidence sequence based
test martingale
Not implementated yet
"""
NotImplementedError

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,231 +0,0 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved. \n",
"\n",
"Licensed under the MIT License.\n",
"\n",
"# Run FLAML in AzureML\n",
"\n",
"\n",
"## 1. Introduction\n",
"\n",
"FLAML is a Python library (https://github.com/microsoft/FLAML) designed to automatically produce accurate machine learning models \n",
"with low computational cost. It is fast and economical. The simple and lightweight design makes it easy \n",
"to use and extend, such as adding new learners. FLAML can \n",
"- serve as an economical AutoML engine,\n",
"- be used as a fast hyperparameter tuning tool, or \n",
"- be embedded in self-tuning software that requires low latency & resource in repetitive\n",
" tuning tasks.\n",
"\n",
"In this notebook, we use one real data example (binary classification) to showcase how to use FLAML library together with AzureML.\n",
"\n",
"FLAML requires `Python>=3.7`. To run this notebook example, please install flaml with the [automl,azureml] option:\n",
"```bash\n",
"pip install flaml[automl,azureml]\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install flaml[automl,azureml]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Enable mlflow in AzureML workspace"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import mlflow\n",
"from azureml.core import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"## 2. Classification Example\n",
"### Load data and preprocess\n",
"\n",
"Download [Airlines dataset](https://www.openml.org/d/1169) from OpenML. The task is to predict whether a given flight will be delayed, given the information of the scheduled departure."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"slideshow": {
"slide_type": "subslide"
},
"tags": []
},
"outputs": [],
"source": [
"from flaml.data import load_openml_dataset\n",
"X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir='./')"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### Run FLAML\n",
"In the FLAML automl run configuration, users can specify the task type, time budget, error metric, learner list, whether to subsample, resampling strategy type, and so on. All these arguments have default values which will be used if users do not provide them. For example, the default ML learners of FLAML are `['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree', 'lrl1']`. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [],
"source": [
"''' import AutoML class from flaml package '''\n",
"from flaml import AutoML\n",
"automl = AutoML()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [],
"source": [
"settings = {\n",
" \"time_budget\": 60, # total running time in seconds\n",
" \"metric\": 'accuracy', \n",
" # check the documentation for options of metrics (https://microsoft.github.io/FLAML/docs/Use-Cases/Task-Oriented-AutoML#optimization-metric)\n",
" \"estimator_list\": ['lgbm', 'rf', 'xgboost'], # list of ML learners\n",
" \"task\": 'classification', # task type \n",
" \"sample\": False, # whether to subsample training data\n",
" \"log_file_name\": 'airlines_experiment.log', # flaml log file\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"outputs": [],
"source": [
"experiment = mlflow.set_experiment(\"flaml\")\n",
"with mlflow.start_run() as run:\n",
" automl.fit(X_train=X_train, y_train=y_train, **settings)\n",
" # log the model\n",
" mlflow.sklearn.log_model(automl, \"automl\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load the model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"automl = mlflow.sklearn.load_model(f\"{run.info.artifact_uri}/automl\")\n",
"print(automl.predict_proba(X_test))\n",
"print(automl.predict(X_test))"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### Retrieve logs"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"slideshow": {
"slide_type": "subslide"
},
"tags": []
},
"outputs": [],
"source": [
"mlflow.search_runs(experiment_ids=[experiment.experiment_id], filter_string=\"params.learner = 'xgboost'\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.13 ('syml-py38')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
},
"vscode": {
"interpreter": {
"hash": "e3d9487e2ef008ade0db1bc293d3206d35cb2b6081faff9f66b40b257b7398f7"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,534 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) 2021. All rights reserved.\n",
"\n",
"Contributed by: @bnriiitb\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Using AutoML in Sklearn Pipeline\n",
"\n",
"This tutorial will help you understand how FLAML's AutoML can be used as a transformer in the Sklearn pipeline."
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"## 1.Introduction\n",
"\n",
"### 1.1 FLAML - Fast and Lightweight AutoML\n",
"\n",
"FLAML is a Python library (https://github.com/microsoft/FLAML) designed to automatically produce accurate machine learning models with low computational cost. It is fast and economical. The simple and lightweight design makes it easy to use and extend, such as adding new learners. \n",
"\n",
"FLAML can \n",
"- serve as an economical AutoML engine,\n",
"- be used as a fast hyperparameter tuning tool, or \n",
"- be embedded in self-tuning software that requires low latency & resource in repetitive\n",
" tuning tasks.\n",
"\n",
"In this notebook, we use one real data example (binary classification) to showcase how to use FLAML library.\n",
"\n",
"FLAML requires `Python>=3.7`. To run this notebook example, please install flaml with the `[automl]` option (this option is introduced from version 2, for version 1 it is installed by default):\n",
"```bash\n",
"pip install flaml[automl]\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"%pip install flaml[automl] openml"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.2 Why are pipelines a silver bullet?\n",
"\n",
"In a typical machine learning workflow we have to apply all the transformations at least twice. \n",
"1. During Training\n",
"2. During Inference\n",
"\n",
    "Scikit-learn pipelines provide an easy-to-use interface to automate ML workflows by allowing several transformers to be chained together. \n",
"\n",
"The key benefits of using pipelines:\n",
"* Make ML workflows highly readable, enabling fast development and easy review\n",
"* Help to build sequential and parallel processes\n",
"* Allow hyperparameter tuning across the estimators\n",
    "* Easier to share and collaborate with multiple users (bug fixes, enhancements, etc.)\n",
"* Enforce the implementation and order of steps"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
    "#### As FLAML's AutoML module can be used as a transformer in the Sklearn pipeline, we can get all the benefits of pipelines and thereby write extremely clean and reusable code."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Classification Example\n",
"### Load data and preprocess\n",
"\n",
"Download [Airlines dataset](https://www.openml.org/d/1169) from OpenML. The task is to predict whether a given flight will be delayed, given the information of the scheduled departure."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"download dataset from openml\n",
"Dataset name: airlines\n",
"X_train.shape: (404537, 7), y_train.shape: (404537,);\n",
"X_test.shape: (134846, 7), y_test.shape: (134846,)\n"
]
}
],
"source": [
"from flaml.data import load_openml_dataset\n",
"X_train, X_test, y_train, y_test = load_openml_dataset(\n",
" dataset_id=1169, data_dir='./', random_state=1234, dataset_format='array')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 12., 2648., 4., 15., 4., 450., 67.], dtype=float32)"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_train[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Create a Pipeline"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-1 {color: black;background-color: white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover 
label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>Pipeline(steps=[(&#x27;imputuer&#x27;, SimpleImputer()),\n",
" (&#x27;standardizer&#x27;, StandardScaler()),\n",
" (&#x27;automl&#x27;,\n",
" AutoML(append_log=False, auto_augment=True, custom_hp={},\n",
" early_stop=False, ensemble=False, estimator_list=&#x27;auto&#x27;,\n",
" eval_method=&#x27;auto&#x27;, fit_kwargs_by_estimator={},\n",
" hpo_method=&#x27;auto&#x27;, keep_search_state=False,\n",
" learner_selector=&#x27;sample&#x27;, log_file_name=&#x27;&#x27;,\n",
" log_training_metric=False, log_type=&#x27;better&#x27;,\n",
" max_iter=None, mem_thres=4294967296, metric=&#x27;auto&#x27;,\n",
" metric_constraints=[], min_sample_size=10000,\n",
" model_history=False, n_concurrent_trials=1, n_jobs=-1,\n",
" n_splits=5, pred_time_limit=inf, retrain_full=True,\n",
" sample=True, split_ratio=0.1, split_type=&#x27;auto&#x27;,\n",
" starting_points=&#x27;static&#x27;, task=&#x27;classification&#x27;, ...))])</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" ><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">Pipeline</label><div class=\"sk-toggleable__content\"><pre>Pipeline(steps=[(&#x27;imputuer&#x27;, SimpleImputer()),\n",
" (&#x27;standardizer&#x27;, StandardScaler()),\n",
" (&#x27;automl&#x27;,\n",
" AutoML(append_log=False, auto_augment=True, custom_hp={},\n",
" early_stop=False, ensemble=False, estimator_list=&#x27;auto&#x27;,\n",
" eval_method=&#x27;auto&#x27;, fit_kwargs_by_estimator={},\n",
" hpo_method=&#x27;auto&#x27;, keep_search_state=False,\n",
" learner_selector=&#x27;sample&#x27;, log_file_name=&#x27;&#x27;,\n",
" log_training_metric=False, log_type=&#x27;better&#x27;,\n",
" max_iter=None, mem_thres=4294967296, metric=&#x27;auto&#x27;,\n",
" metric_constraints=[], min_sample_size=10000,\n",
" model_history=False, n_concurrent_trials=1, n_jobs=-1,\n",
" n_splits=5, pred_time_limit=inf, retrain_full=True,\n",
" sample=True, split_ratio=0.1, split_type=&#x27;auto&#x27;,\n",
" starting_points=&#x27;static&#x27;, task=&#x27;classification&#x27;, ...))])</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-2\" type=\"checkbox\" ><label for=\"sk-estimator-id-2\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">SimpleImputer</label><div class=\"sk-toggleable__content\"><pre>SimpleImputer()</pre></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-3\" type=\"checkbox\" ><label for=\"sk-estimator-id-3\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">StandardScaler</label><div class=\"sk-toggleable__content\"><pre>StandardScaler()</pre></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-4\" type=\"checkbox\" ><label for=\"sk-estimator-id-4\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">AutoML</label><div class=\"sk-toggleable__content\"><pre>AutoML(append_log=False, auto_augment=True, custom_hp={}, early_stop=False,\n",
" ensemble=False, estimator_list=&#x27;auto&#x27;, eval_method=&#x27;auto&#x27;,\n",
" fit_kwargs_by_estimator={}, hpo_method=&#x27;auto&#x27;, keep_search_state=False,\n",
" learner_selector=&#x27;sample&#x27;, log_file_name=&#x27;&#x27;, log_training_metric=False,\n",
" log_type=&#x27;better&#x27;, max_iter=None, mem_thres=4294967296, metric=&#x27;auto&#x27;,\n",
" metric_constraints=[], min_sample_size=10000, model_history=False,\n",
" n_concurrent_trials=1, n_jobs=-1, n_splits=5, pred_time_limit=inf,\n",
" retrain_full=True, sample=True, split_ratio=0.1, split_type=&#x27;auto&#x27;,\n",
" starting_points=&#x27;static&#x27;, task=&#x27;classification&#x27;, ...)</pre></div></div></div></div></div></div></div>"
],
"text/plain": [
"Pipeline(steps=[('imputuer', SimpleImputer()),\n",
" ('standardizer', StandardScaler()),\n",
" ('automl',\n",
" AutoML(append_log=False, auto_augment=True, custom_hp={},\n",
" early_stop=False, ensemble=False, estimator_list='auto',\n",
" eval_method='auto', fit_kwargs_by_estimator={},\n",
" hpo_method='auto', keep_search_state=False,\n",
" learner_selector='sample', log_file_name='',\n",
" log_training_metric=False, log_type='better',\n",
" max_iter=None, mem_thres=4294967296, metric='auto',\n",
" metric_constraints=[], min_sample_size=10000,\n",
" model_history=False, n_concurrent_trials=1, n_jobs=-1,\n",
" n_splits=5, pred_time_limit=inf, retrain_full=True,\n",
" sample=True, split_ratio=0.1, split_type='auto',\n",
" starting_points='static', task='classification', ...))])"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn import set_config\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.preprocessing import StandardScaler\n",
"from flaml import AutoML\n",
"\n",
"set_config(display='diagram')\n",
"\n",
"imputer = SimpleImputer()\n",
"standardizer = StandardScaler()\n",
"automl = AutoML()\n",
"\n",
"automl_pipeline = Pipeline([\n",
" (\"imputuer\",imputer),\n",
" (\"standardizer\", standardizer),\n",
" (\"automl\", automl)\n",
"])\n",
"automl_pipeline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Run FLAML\n",
"In the FLAML automl run configuration, users can specify the task type, time budget, error metric, learner list, whether to subsample, resampling strategy type, and so on. All these arguments have default values which will be used if users do not provide them. For example, the default ML learners of FLAML are `['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree', 'lrl1']`. "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"automl_settings = {\n",
" \"time_budget\": 60, # total running time in seconds\n",
" \"metric\": 'accuracy', # primary metrics can be chosen from: ['accuracy','roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'f1','log_loss','mae','mse','r2']\n",
" \"task\": 'classification', # task type \n",
" \"estimator_list\": ['xgboost','catboost','lgbm'],\n",
" \"log_file_name\": 'airlines_experiment.log', # flaml log file\n",
"}\n",
"pipeline_settings = {f\"automl__{key}\": value for key, value in automl_settings.items()}"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[flaml.automl: 06-22 08:01:43] {2390} INFO - task = classification\n",
"[flaml.automl: 06-22 08:01:43] {2392} INFO - Data split method: stratified\n",
"[flaml.automl: 06-22 08:01:43] {2396} INFO - Evaluation method: holdout\n",
"[flaml.automl: 06-22 08:01:44] {2465} INFO - Minimizing error metric: 1-accuracy\n",
"[flaml.automl: 06-22 08:01:44] {2605} INFO - List of ML learners in AutoML Run: ['xgboost', 'catboost', 'lgbm']\n",
"[flaml.automl: 06-22 08:01:44] {2897} INFO - iteration 0, current learner xgboost\n",
"[flaml.automl: 06-22 08:01:44] {3025} INFO - Estimated sufficient time budget=105341s. Estimated necessary time budget=116s.\n",
"[flaml.automl: 06-22 08:01:44] {3072} INFO - at 0.7s,\testimator xgboost's best error=0.3755,\tbest estimator xgboost's best error=0.3755\n",
"[flaml.automl: 06-22 08:01:44] {2897} INFO - iteration 1, current learner lgbm\n",
"[flaml.automl: 06-22 08:01:44] {3072} INFO - at 0.9s,\testimator lgbm's best error=0.3814,\tbest estimator xgboost's best error=0.3755\n",
"[flaml.automl: 06-22 08:01:44] {2897} INFO - iteration 2, current learner xgboost\n",
"[flaml.automl: 06-22 08:01:45] {3072} INFO - at 1.3s,\testimator xgboost's best error=0.3755,\tbest estimator xgboost's best error=0.3755\n",
"[flaml.automl: 06-22 08:01:45] {2897} INFO - iteration 3, current learner lgbm\n",
"[flaml.automl: 06-22 08:01:45] {3072} INFO - at 1.5s,\testimator lgbm's best error=0.3814,\tbest estimator xgboost's best error=0.3755\n",
"[flaml.automl: 06-22 08:01:45] {2897} INFO - iteration 4, current learner xgboost\n",
"[flaml.automl: 06-22 08:01:45] {3072} INFO - at 1.8s,\testimator xgboost's best error=0.3755,\tbest estimator xgboost's best error=0.3755\n",
"[flaml.automl: 06-22 08:01:45] {2897} INFO - iteration 5, current learner lgbm\n",
"[flaml.automl: 06-22 08:01:45] {3072} INFO - at 2.0s,\testimator lgbm's best error=0.3755,\tbest estimator xgboost's best error=0.3755\n",
"[flaml.automl: 06-22 08:01:45] {2897} INFO - iteration 6, current learner xgboost\n",
"[flaml.automl: 06-22 08:01:46] {3072} INFO - at 2.3s,\testimator xgboost's best error=0.3724,\tbest estimator xgboost's best error=0.3724\n",
"[flaml.automl: 06-22 08:01:46] {2897} INFO - iteration 7, current learner xgboost\n",
"[flaml.automl: 06-22 08:01:46] {3072} INFO - at 2.6s,\testimator xgboost's best error=0.3724,\tbest estimator xgboost's best error=0.3724\n",
"[flaml.automl: 06-22 08:01:46] {2897} INFO - iteration 8, current learner xgboost\n",
"[flaml.automl: 06-22 08:01:47] {3072} INFO - at 3.1s,\testimator xgboost's best error=0.3657,\tbest estimator xgboost's best error=0.3657\n",
"[flaml.automl: 06-22 08:01:47] {2897} INFO - iteration 9, current learner xgboost\n",
"[flaml.automl: 06-22 08:01:47] {3072} INFO - at 3.6s,\testimator xgboost's best error=0.3657,\tbest estimator xgboost's best error=0.3657\n",
"[flaml.automl: 06-22 08:01:47] {2897} INFO - iteration 10, current learner xgboost\n",
"[flaml.automl: 06-22 08:01:48] {3072} INFO - at 4.8s,\testimator xgboost's best error=0.3592,\tbest estimator xgboost's best error=0.3592\n",
"[flaml.automl: 06-22 08:01:48] {2897} INFO - iteration 11, current learner xgboost\n",
"[flaml.automl: 06-22 08:01:50] {3072} INFO - at 6.8s,\testimator xgboost's best error=0.3580,\tbest estimator xgboost's best error=0.3580\n",
"[flaml.automl: 06-22 08:01:50] {2897} INFO - iteration 12, current learner xgboost\n",
"[flaml.automl: 06-22 08:01:51] {3072} INFO - at 8.1s,\testimator xgboost's best error=0.3580,\tbest estimator xgboost's best error=0.3580\n",
"[flaml.automl: 06-22 08:01:51] {2897} INFO - iteration 13, current learner lgbm\n",
"[flaml.automl: 06-22 08:01:52] {3072} INFO - at 8.4s,\testimator lgbm's best error=0.3644,\tbest estimator xgboost's best error=0.3580\n",
"[flaml.automl: 06-22 08:01:52] {2897} INFO - iteration 14, current learner lgbm\n",
"[flaml.automl: 06-22 08:01:52] {3072} INFO - at 8.7s,\testimator lgbm's best error=0.3644,\tbest estimator xgboost's best error=0.3580\n",
"[flaml.automl: 06-22 08:01:52] {2897} INFO - iteration 15, current learner lgbm\n",
"[flaml.automl: 06-22 08:01:53] {3072} INFO - at 9.3s,\testimator lgbm's best error=0.3644,\tbest estimator xgboost's best error=0.3580\n",
"[flaml.automl: 06-22 08:01:53] {2897} INFO - iteration 16, current learner xgboost\n",
"[flaml.automl: 06-22 08:01:56] {3072} INFO - at 12.1s,\testimator xgboost's best error=0.3559,\tbest estimator xgboost's best error=0.3559\n",
"[flaml.automl: 06-22 08:01:56] {2897} INFO - iteration 17, current learner lgbm\n",
"[flaml.automl: 06-22 08:01:56] {3072} INFO - at 12.6s,\testimator lgbm's best error=0.3604,\tbest estimator xgboost's best error=0.3559\n",
"[flaml.automl: 06-22 08:01:56] {2897} INFO - iteration 18, current learner catboost\n",
"[flaml.automl: 06-22 08:01:56] {3072} INFO - at 13.0s,\testimator catboost's best error=0.3615,\tbest estimator xgboost's best error=0.3559\n",
"[flaml.automl: 06-22 08:01:56] {2897} INFO - iteration 19, current learner catboost\n",
"[flaml.automl: 06-22 08:01:57] {3072} INFO - at 13.7s,\testimator catboost's best error=0.3615,\tbest estimator xgboost's best error=0.3559\n",
"[flaml.automl: 06-22 08:01:57] {2897} INFO - iteration 20, current learner catboost\n",
"[flaml.automl: 06-22 08:01:57] {3072} INFO - at 13.9s,\testimator catboost's best error=0.3615,\tbest estimator xgboost's best error=0.3559\n",
"[flaml.automl: 06-22 08:01:57] {2897} INFO - iteration 21, current learner xgboost\n",
"[flaml.automl: 06-22 08:01:59] {3072} INFO - at 15.7s,\testimator xgboost's best error=0.3559,\tbest estimator xgboost's best error=0.3559\n",
"[flaml.automl: 06-22 08:01:59] {2897} INFO - iteration 22, current learner catboost\n",
"[flaml.automl: 06-22 08:02:00] {3072} INFO - at 16.5s,\testimator catboost's best error=0.3489,\tbest estimator catboost's best error=0.3489\n",
"[flaml.automl: 06-22 08:02:00] {2897} INFO - iteration 23, current learner catboost\n",
"[flaml.automl: 06-22 08:02:02] {3072} INFO - at 18.9s,\testimator catboost's best error=0.3489,\tbest estimator catboost's best error=0.3489\n",
"[flaml.automl: 06-22 08:02:02] {2897} INFO - iteration 24, current learner lgbm\n",
"[flaml.automl: 06-22 08:02:03] {3072} INFO - at 19.2s,\testimator lgbm's best error=0.3604,\tbest estimator catboost's best error=0.3489\n",
"[flaml.automl: 06-22 08:02:03] {2897} INFO - iteration 25, current learner catboost\n",
"[flaml.automl: 06-22 08:02:03] {3072} INFO - at 20.0s,\testimator catboost's best error=0.3472,\tbest estimator catboost's best error=0.3472\n",
"[flaml.automl: 06-22 08:02:03] {2897} INFO - iteration 26, current learner catboost\n",
"[flaml.automl: 06-22 08:02:06] {3072} INFO - at 22.2s,\testimator catboost's best error=0.3472,\tbest estimator catboost's best error=0.3472\n",
"[flaml.automl: 06-22 08:02:06] {2897} INFO - iteration 27, current learner lgbm\n",
"[flaml.automl: 06-22 08:02:06] {3072} INFO - at 22.6s,\testimator lgbm's best error=0.3604,\tbest estimator catboost's best error=0.3472\n",
"[flaml.automl: 06-22 08:02:06] {2897} INFO - iteration 28, current learner lgbm\n",
"[flaml.automl: 06-22 08:02:06] {3072} INFO - at 22.9s,\testimator lgbm's best error=0.3604,\tbest estimator catboost's best error=0.3472\n",
"[flaml.automl: 06-22 08:02:06] {2897} INFO - iteration 29, current learner catboost\n",
"[flaml.automl: 06-22 08:02:07] {3072} INFO - at 23.6s,\testimator catboost's best error=0.3472,\tbest estimator catboost's best error=0.3472\n",
"[flaml.automl: 06-22 08:02:07] {2897} INFO - iteration 30, current learner xgboost\n",
"[flaml.automl: 06-22 08:02:09] {3072} INFO - at 25.4s,\testimator xgboost's best error=0.3548,\tbest estimator catboost's best error=0.3472\n",
"[flaml.automl: 06-22 08:02:09] {2897} INFO - iteration 31, current learner catboost\n",
"[flaml.automl: 06-22 08:02:16] {3072} INFO - at 32.3s,\testimator catboost's best error=0.3388,\tbest estimator catboost's best error=0.3388\n",
"[flaml.automl: 06-22 08:02:16] {2897} INFO - iteration 32, current learner lgbm\n",
"[flaml.automl: 06-22 08:02:16] {3072} INFO - at 32.7s,\testimator lgbm's best error=0.3604,\tbest estimator catboost's best error=0.3388\n",
"[flaml.automl: 06-22 08:02:16] {2897} INFO - iteration 33, current learner catboost\n",
"[flaml.automl: 06-22 08:02:22] {3072} INFO - at 38.5s,\testimator catboost's best error=0.3388,\tbest estimator catboost's best error=0.3388\n",
"[flaml.automl: 06-22 08:02:22] {2897} INFO - iteration 34, current learner catboost\n",
"[flaml.automl: 06-22 08:02:43] {3072} INFO - at 59.6s,\testimator catboost's best error=0.3388,\tbest estimator catboost's best error=0.3388\n",
"[flaml.automl: 06-22 08:02:46] {3336} INFO - retrain catboost for 2.8s\n",
"[flaml.automl: 06-22 08:02:46] {3343} INFO - retrained model: <catboost.core.CatBoostClassifier object at 0x7fbeeb3859d0>\n",
"[flaml.automl: 06-22 08:02:46] {2636} INFO - fit succeeded\n",
"[flaml.automl: 06-22 08:02:46] {2637} INFO - Time taken to find the best model: 32.311296463012695\n"
]
},
{
"data": {
"text/html": [
"<style>#sk-container-id-2 {color: black;background-color: white;}#sk-container-id-2 pre{padding: 0;}#sk-container-id-2 div.sk-toggleable {background-color: white;}#sk-container-id-2 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-2 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-2 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-2 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-2 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-2 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-2 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-2 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-2 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-2 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-2 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-2 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-2 div.sk-label:hover 
label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-2 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-2 div.sk-item {position: relative;z-index: 1;}#sk-container-id-2 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-2 div.sk-item::before, #sk-container-id-2 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-2 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-2 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-2 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-2 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-2 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-2 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-2 div.sk-label-container {text-align: center;}#sk-container-id-2 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-2 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-2\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>Pipeline(steps=[(&#x27;imputuer&#x27;, SimpleImputer()),\n",
" (&#x27;standardizer&#x27;, StandardScaler()),\n",
" (&#x27;automl&#x27;,\n",
" AutoML(append_log=False, auto_augment=True, custom_hp={},\n",
" early_stop=False, ensemble=False, estimator_list=&#x27;auto&#x27;,\n",
" eval_method=&#x27;auto&#x27;, fit_kwargs_by_estimator={},\n",
" hpo_method=&#x27;auto&#x27;, keep_search_state=False,\n",
" learner_selector=&#x27;sample&#x27;, log_file_name=&#x27;&#x27;,\n",
" log_training_metric=False, log_type=&#x27;better&#x27;,\n",
" max_iter=None, mem_thres=4294967296, metric=&#x27;auto&#x27;,\n",
" metric_constraints=[], min_sample_size=10000,\n",
" model_history=False, n_concurrent_trials=1, n_jobs=-1,\n",
" n_splits=5, pred_time_limit=inf, retrain_full=True,\n",
" sample=True, split_ratio=0.1, split_type=&#x27;auto&#x27;,\n",
" starting_points=&#x27;static&#x27;, task=&#x27;classification&#x27;, ...))])</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-5\" type=\"checkbox\" ><label for=\"sk-estimator-id-5\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">Pipeline</label><div class=\"sk-toggleable__content\"><pre>Pipeline(steps=[(&#x27;imputuer&#x27;, SimpleImputer()),\n",
" (&#x27;standardizer&#x27;, StandardScaler()),\n",
" (&#x27;automl&#x27;,\n",
" AutoML(append_log=False, auto_augment=True, custom_hp={},\n",
" early_stop=False, ensemble=False, estimator_list=&#x27;auto&#x27;,\n",
" eval_method=&#x27;auto&#x27;, fit_kwargs_by_estimator={},\n",
" hpo_method=&#x27;auto&#x27;, keep_search_state=False,\n",
" learner_selector=&#x27;sample&#x27;, log_file_name=&#x27;&#x27;,\n",
" log_training_metric=False, log_type=&#x27;better&#x27;,\n",
" max_iter=None, mem_thres=4294967296, metric=&#x27;auto&#x27;,\n",
" metric_constraints=[], min_sample_size=10000,\n",
" model_history=False, n_concurrent_trials=1, n_jobs=-1,\n",
" n_splits=5, pred_time_limit=inf, retrain_full=True,\n",
" sample=True, split_ratio=0.1, split_type=&#x27;auto&#x27;,\n",
" starting_points=&#x27;static&#x27;, task=&#x27;classification&#x27;, ...))])</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-6\" type=\"checkbox\" ><label for=\"sk-estimator-id-6\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">SimpleImputer</label><div class=\"sk-toggleable__content\"><pre>SimpleImputer()</pre></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-7\" type=\"checkbox\" ><label for=\"sk-estimator-id-7\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">StandardScaler</label><div class=\"sk-toggleable__content\"><pre>StandardScaler()</pre></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-8\" type=\"checkbox\" ><label for=\"sk-estimator-id-8\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">AutoML</label><div class=\"sk-toggleable__content\"><pre>AutoML(append_log=False, auto_augment=True, custom_hp={}, early_stop=False,\n",
" ensemble=False, estimator_list=&#x27;auto&#x27;, eval_method=&#x27;auto&#x27;,\n",
" fit_kwargs_by_estimator={}, hpo_method=&#x27;auto&#x27;, keep_search_state=False,\n",
" learner_selector=&#x27;sample&#x27;, log_file_name=&#x27;&#x27;, log_training_metric=False,\n",
" log_type=&#x27;better&#x27;, max_iter=None, mem_thres=4294967296, metric=&#x27;auto&#x27;,\n",
" metric_constraints=[], min_sample_size=10000, model_history=False,\n",
" n_concurrent_trials=1, n_jobs=-1, n_splits=5, pred_time_limit=inf,\n",
" retrain_full=True, sample=True, split_ratio=0.1, split_type=&#x27;auto&#x27;,\n",
" starting_points=&#x27;static&#x27;, task=&#x27;classification&#x27;, ...)</pre></div></div></div></div></div></div></div>"
],
"text/plain": [
"Pipeline(steps=[('imputuer', SimpleImputer()),\n",
" ('standardizer', StandardScaler()),\n",
" ('automl',\n",
" AutoML(append_log=False, auto_augment=True, custom_hp={},\n",
" early_stop=False, ensemble=False, estimator_list='auto',\n",
" eval_method='auto', fit_kwargs_by_estimator={},\n",
" hpo_method='auto', keep_search_state=False,\n",
" learner_selector='sample', log_file_name='',\n",
" log_training_metric=False, log_type='better',\n",
" max_iter=None, mem_thres=4294967296, metric='auto',\n",
" metric_constraints=[], min_sample_size=10000,\n",
" model_history=False, n_concurrent_trials=1, n_jobs=-1,\n",
" n_splits=5, pred_time_limit=inf, retrain_full=True,\n",
" sample=True, split_ratio=0.1, split_type='auto',\n",
" starting_points='static', task='classification', ...))])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"automl_pipeline.fit(X_train, y_train, **pipeline_settings)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best ML leaner: xgboost\n",
"Best hyperparmeter config: {'n_estimators': 63, 'max_leaves': 1797, 'min_child_weight': 0.07275175679381725, 'learning_rate': 0.06234183309508761, 'subsample': 0.9814772488195874, 'colsample_bylevel': 0.810466508891351, 'colsample_bytree': 0.8005378817953572, 'reg_alpha': 0.5768305704485758, 'reg_lambda': 6.867180836557797, 'FLAML_sample_size': 364083}\n",
"Best accuracy on validation data: 0.6721\n",
"Training duration of best run: 15.45 s\n"
]
}
],
"source": [
"# Get the automl object from the pipeline\n",
"automl = automl_pipeline.steps[2][1]\n",
"\n",
"# Get the best config and best learner\n",
"print('Best ML leaner:', automl.best_estimator)\n",
"print('Best hyperparmeter config:', automl.best_config)\n",
"print('Best accuracy on validation data: {0:.4g}'.format(1-automl.best_loss))\n",
"print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<flaml.model.XGBoostSklearnEstimator at 0x7f03a5eada00>"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"automl.model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Persist the model binary file"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Persist the automl object as pickle file\n",
"import pickle\n",
"with open('automl.pkl', 'wb') as f:\n",
" pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Predicted labels [0 1 1 ... 0 1 0]\n",
"True labels [0 0 0 ... 1 0 1]\n",
"Predicted probas [0.3764987 0.6126277 0.699604 0.27359942 0.25294745]\n"
]
}
],
"source": [
"# Performance inference on the testing dataset\n",
"y_pred = automl_pipeline.predict(X_test)\n",
"print('Predicted labels', y_pred)\n",
"print('True labels', y_test)\n",
"y_pred_proba = automl_pipeline.predict_proba(X_test)[:,1]\n",
"print('Predicted probas ',y_pred_proba[:5])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.12 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
},
"vscode": {
"interpreter": {
"hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1"
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}

File diff suppressed because one or more lines are too long

View File

@ -1,808 +0,0 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c). All rights reserved.\n",
"\n",
"Licensed under the MIT License.\n",
"\n",
"# Troubleshooting HPO for fine-tuning pre-trained language models\n",
"\n",
"## 1. Introduction\n",
"\n",
"In this notebook, we demonstrate a procedure for troubleshooting HPO failure in fine-tuning pre-trained language models (introduced in the following paper):\n",
"\n",
"*[An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models](https://arxiv.org/abs/2106.09204). Xueqing Liu, Chi Wang. ACL-IJCNLP 2021*\n",
"\n",
"Notes:\n",
"\n",
"*In this notebook, we only run each experiment 1 time for simplicity, which is different from the paper (3 times). To reproduce the paper's result, please run 3 repetitions and take the average scores.\n",
"\n",
"*Running this notebook takes about one hour.\n",
"\n",
"FLAML requires `Python>=3.7`. To run this notebook example, please install flaml with the legacy `[nlp]` options:\n",
"\n",
"```bash\n",
"pip install flaml[nlp]==0.7.1 # in higher version of flaml, the API for nlp tasks changed\n",
"```\n",
"\n",
"Our paper was developed under transformers version 3.4.0. We uninstall and reinstall transformers==3.4.0:\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"%pip install flaml[nlp]==0.7.1 # in higher version of flaml, the API for nlp tasks changed\n",
"%pip install transformers==3.4.0\n",
"from flaml.nlp import AutoTransformers\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Initial Experimental Study\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load dataset \n",
"\n",
"Load the dataset using AutoTransformer.prepare_data. In this notebook, we use the Microsoft Research Paraphrasing Corpus (MRPC) dataset and the Electra model as an example:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"console_args has no attribute pretrained_model_size, continue\n",
"console_args has no attribute dataset_subdataset_name, continue\n",
"console_args has no attribute algo_mode, continue\n",
"console_args has no attribute space_mode, continue\n",
"console_args has no attribute search_alg_args_mode, continue\n",
"console_args has no attribute algo_name, continue\n",
"console_args has no attribute pruner, continue\n",
"console_args has no attribute resplit_mode, continue\n",
"console_args has no attribute rep_id, continue\n",
"console_args has no attribute seed_data, continue\n",
"console_args has no attribute seed_transformers, continue\n",
"console_args has no attribute learning_rate, continue\n",
"console_args has no attribute weight_decay, continue\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Reusing dataset glue (/home/xliu127/.cache/huggingface/datasets/glue/mrpc/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
"Loading cached processed dataset at /home/xliu127/.cache/huggingface/datasets/glue/mrpc/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-6a78e5c95406457c.arrow\n",
"Loading cached processed dataset at /home/xliu127/.cache/huggingface/datasets/glue/mrpc/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-e8d0f3e04c3b4588.arrow\n",
"Loading cached processed dataset at /home/xliu127/.cache/huggingface/datasets/glue/mrpc/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-4b0966b394994163.arrow\n",
"Loading cached processed dataset at /home/xliu127/.cache/huggingface/datasets/glue/mrpc/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-6a78e5c95406457c.arrow\n",
"Loading cached processed dataset at /home/xliu127/.cache/huggingface/datasets/glue/mrpc/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-e8d0f3e04c3b4588.arrow\n",
"Loading cached processed dataset at /home/xliu127/.cache/huggingface/datasets/glue/mrpc/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-4b0966b394994163.arrow\n"
]
}
],
"source": [
"autohf = AutoTransformers()\n",
"preparedata_setting = {\n",
" \"dataset_subdataset_name\": \"glue:mrpc\",\n",
" \"pretrained_model_size\": \"google/electra-base-discriminator:base\",\n",
" \"data_root_path\": \"data/\",\n",
" \"max_seq_length\": 128,\n",
" }\n",
"autohf.prepare_data(**preparedata_setting)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"### Running grid search\n",
"\n",
"First, we run grid search using Electra. By specifying `algo_mode=\"grid\"`, AutoTransformers will run the grid search algorithm. By specifying `space_mode=\"grid\"`, AutoTransformers will use the default grid search configuration recommended by the Electra paper:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"pycharm": {
"name": "#%%\n"
},
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"== Status ==<br>Memory usage on this node: 14.2/376.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 0/96 CPUs, 0/4 GPUs, 0.0/250.73 GiB heap, 0.0/76.9 GiB objects (0/1.0 accelerator_type:V100)<br>Current best trial: 67d99_00002 with accuracy=0.7254901960784313 and parameters={'learning_rate': 0.0001, 'weight_decay': 0.0, 'adam_epsilon': 1e-06, 'warmup_ratio': 0.1, 'per_device_train_batch_size': 32, 'hidden_dropout_prob': 0.1, 'attention_probs_dropout_prob': 0.1, 'num_train_epochs': 0.5, 'seed': 42}<br>Result logdir: /data/xliu127/projects/hyperopt/FLAML/notebook/data/checkpoint/dat=glue_subdat=mrpc_mod=grid_spa=grid_arg=dft_alg=grid_pru=None_pre=electra_presz=base_spt=ori_rep=0_sddt=43_sdhf=42_var1=None_var2=None/ray_result<br>Number of trials: 4/4 (4 TERMINATED)<br><br>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2021-06-16 10:45:35,071\tINFO tune.py:450 -- Total run time: 106.56 seconds (106.41 seconds for the tuning loop).\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total running time: 106.57789206504822 seconds\n"
]
}
],
"source": [
"import transformers\n",
"autohf_settings = {\n",
" \"resources_per_trial\": {\"gpu\": 1, \"cpu\": 1},\n",
" \"num_samples\": 1,\n",
" \"time_budget\": 100000, # unlimited time budget\n",
" \"fp16\": True,\n",
" \"algo_mode\": \"grid\", # set the search algorithm to grid search\n",
" \"space_mode\": \"grid\", # set the search space to the recommended grid space\n",
" \"transformers_verbose\": transformers.logging.ERROR\n",
" }\n",
"validation_metric, analysis = autohf.fit(**autohf_settings)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get the time for running grid search: "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"grid search for glue_mrpc took 106.57789206504822 seconds\n"
]
}
],
"source": [
"GST = autohf.last_run_duration\n",
"print(\"grid search for {} took {} seconds\".format(autohf.jobid_config.get_jobid_full_data_name(), GST))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"After the HPO run finishes, generate the predictions and save them as a .zip file to be submitted to the glue website. Here we will need the AzureUtils library, which is for storing the output information (e.g., analysis log, .zip file) locally and uploading the output to an azure blob container (e.g., if multiple jobs are executed in a cluster). If the azure key and container information is not specified, the output information will only be saved locally. "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"remove_columns_ is deprecated and will be removed in the next major version of datasets. Use the dataset.remove_columns method instead.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Cleaning the existing label column from test data\n"
]
},
{
"data": {
"text/html": [
"\n",
" <div>\n",
" <style>\n",
" /* Turns off some styling */\n",
" progress {\n",
" /* gets rid of default border in Firefox and Opera. */\n",
" border: none;\n",
" /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
" background-size: auto;\n",
" }\n",
" </style>\n",
" \n",
" <progress value='432' max='432' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
" [432/432 00:34]\n",
" </div>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"JobID(dat=['glue'], subdat='mrpc', mod='grid', spa='grid', arg='dft', alg='grid', pru='None', pre_full='google/electra-base-discriminator', pre='electra', presz='base', spt='ori', rep=0, sddt=43, sdhf=42, var1=None, var2=None)\n",
"Your output will not be synced to azure because azure key and container name are not specified\n",
"The path for saving the prediction .zip file is not specified, setting to data/ by default\n",
"Your output will not be synced to azure because azure key and container name are not specified\n",
"{'eval_accuracy': 0.7254901960784313, 'eval_f1': 0.8276923076923076, 'eval_loss': 0.516851007938385}\n"
]
}
],
"source": [
"predictions, test_metric = autohf.predict()\n",
"from flaml.nlp import AzureUtils\n",
"\n",
"print(autohf.jobid_config)\n",
"\n",
"azure_utils = AzureUtils(root_log_path=\"logs_test/\", autohf=autohf)\n",
"azure_utils.write_autohf_output(valid_metric=validation_metric,\n",
" predictions=predictions,\n",
" duration=GST)\n",
"print(validation_metric)"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"The validation F1/accuracy we got was 92.4/89.5. After the above steps, you will find a .zip file for the predictions under data/result/. Submit the .zip file to the glue website. The test F1/accuracy we got was 90.4/86.7. As an example, we only run the experiment one time, but in general, we should run the experiment for multiple repetitions and report the averaged validation and test accuracy."
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"### Running Random Search\n",
"\n",
"Next, we run random search with the same time budget as grid search:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"def tune_hpo(time_budget, this_hpo_space):\n",
" autohf_settings = {\n",
" \"resources_per_trial\": {\"gpu\": 1, \"cpu\": 1},\n",
" \"num_samples\": -1,\n",
" \"time_budget\": time_budget,\n",
" \"fp16\": True,\n",
" \"algo_mode\": \"hpo\", # set the search algorithm mode to hpo\n",
" \"algo_name\": \"rs\",\n",
" \"space_mode\": \"cus\", # customized search space (this_hpo_space)\n",
" \"hpo_space\": this_hpo_space,\n",
" \"transformers_verbose\": transformers.logging.ERROR\n",
" }\n",
" validation_metric, analysis = autohf.fit(**autohf_settings)\n",
" predictions, test_metric = autohf.predict()\n",
" azure_utils = AzureUtils(root_log_path=\"logs_test/\", autohf=autohf)\n",
" azure_utils.write_autohf_output(valid_metric=validation_metric,\n",
" predictions=predictions,\n",
" duration=GST)\n",
" print(validation_metric)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"== Status ==<br>Memory usage on this node: 30.1/376.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 0/96 CPUs, 0/4 GPUs, 0.0/247.51 GiB heap, 0.0/75.93 GiB objects (0/1.0 accelerator_type:V100)<br>Current best trial: c67b4_00003 with accuracy=0.7303921568627451 and parameters={'learning_rate': 4.030097060410288e-05, 'warmup_ratio': 0.06084844859190755, 'num_train_epochs': 0.5, 'per_device_train_batch_size': 16, 'weight_decay': 0.15742692948967135, 'attention_probs_dropout_prob': 0.08638900372842316, 'hidden_dropout_prob': 0.058245828039608386, 'seed': 42}<br>Result logdir: /data/xliu127/projects/hyperopt/FLAML/notebook/data/checkpoint/dat=glue_subdat=mrpc_mod=hpo_spa=cus_arg=dft_alg=rs_pru=None_pre=electra_presz=base_spt=ori_rep=0_sddt=43_sdhf=42_var1=None_var2=None/ray_result<br>Number of trials: 8/infinite (8 TERMINATED)<br><br>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(pid=50964)\u001b[0m {'eval_loss': 0.5942569971084595, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'epoch': 0.10434782608695652}\n",
"\u001b[2m\u001b[36m(pid=50964)\u001b[0m {'eval_loss': 0.5942569971084595, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'epoch': 0.10434782608695652}\n",
"\u001b[2m\u001b[36m(pid=50948)\u001b[0m {'eval_loss': 0.649192214012146, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'epoch': 0.2}\n",
"\u001b[2m\u001b[36m(pid=50948)\u001b[0m {'eval_loss': 0.649192214012146, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'epoch': 0.2}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2021-06-16 10:48:21,624\tINFO tune.py:450 -- Total run time: 114.32 seconds (109.41 seconds for the tuning loop).\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total running time: 114.35665488243103 seconds\n"
]
},
{
"data": {
"text/html": [
"\n",
" <div>\n",
" <style>\n",
" /* Turns off some styling */\n",
" progress {\n",
" /* gets rid of default border in Firefox and Opera. */\n",
" border: none;\n",
" /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
" background-size: auto;\n",
" }\n",
" </style>\n",
" \n",
" <progress value='432' max='432' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
" [432/432 00:33]\n",
" </div>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Your output will not be synced to azure because azure key and container name are not specified\n",
"The path for saving the prediction .zip file is not specified, setting to data/ by default\n",
"Your output will not be synced to azure because azure key and container name are not specified\n",
"{'eval_accuracy': 0.7328431372549019, 'eval_f1': 0.8320493066255777, 'eval_loss': 0.5411379933357239}\n"
]
}
],
"source": [
"hpo_space_full = {\n",
" \"learning_rate\": {\"l\": 3e-5, \"u\": 1.5e-4, \"space\": \"log\"},\n",
" \"warmup_ratio\": {\"l\": 0, \"u\": 0.2, \"space\": \"linear\"},\n",
" \"num_train_epochs\": [3],\n",
" \"per_device_train_batch_size\": [16, 32, 64],\n",
" \"weight_decay\": {\"l\": 0.0, \"u\": 0.3, \"space\": \"linear\"},\n",
" \"attention_probs_dropout_prob\": {\"l\": 0, \"u\": 0.2, \"space\": \"linear\"},\n",
" \"hidden_dropout_prob\": {\"l\": 0, \"u\": 0.2, \"space\": \"linear\"},\n",
" }\n",
"\n",
"tune_hpo(GST, hpo_space_full)"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"The validation F1/accuracy we got was 93.5/90.9. Similarly, we can submit the .zip file to the glue website. The test F1/accuracy we got was 81.6/70.2. "
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## 3. Troubleshooting HPO Failures\n",
"\n",
"Since the validation accuracy is larger than grid search while the test accuracy is smaller, HPO is overfitting. We reduce the search space:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"== Status ==<br>Memory usage on this node: 26.5/376.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 0/96 CPUs, 0/4 GPUs, 0.0/247.51 GiB heap, 0.0/75.93 GiB objects (0/1.0 accelerator_type:V100)<br>Current best trial: 234d8_00003 with accuracy=0.7475490196078431 and parameters={'learning_rate': 0.00011454435497690623, 'warmup_ratio': 0.1, 'num_train_epochs': 0.5, 'per_device_train_batch_size': 16, 'weight_decay': 0.06370173320348284, 'attention_probs_dropout_prob': 0.03636499344142013, 'hidden_dropout_prob': 0.03668090197068676, 'seed': 42}<br>Result logdir: /data/xliu127/projects/hyperopt/FLAML/notebook/data/checkpoint/dat=glue_subdat=mrpc_mod=hpo_spa=cus_arg=dft_alg=rs_pru=None_pre=electra_presz=base_spt=ori_rep=0_sddt=43_sdhf=42_var1=None_var2=None/ray_result<br>Number of trials: 6/infinite (6 TERMINATED)<br><br>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(pid=54411)\u001b[0m {'eval_loss': 0.624100387096405, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'epoch': 0.5}\n",
"\u001b[2m\u001b[36m(pid=54411)\u001b[0m {'eval_loss': 0.624100387096405, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'epoch': 0.5}\n",
"\u001b[2m\u001b[36m(pid=54411)\u001b[0m {'eval_loss': 0.624100387096405, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'epoch': 0.5}\n",
"\u001b[2m\u001b[36m(pid=54417)\u001b[0m {'eval_loss': 0.5938675999641418, 'eval_accuracy': 0.7156862745098039, 'eval_f1': 0.8258258258258258, 'epoch': 0.5}\n",
"\u001b[2m\u001b[36m(pid=54417)\u001b[0m {'eval_loss': 0.5938675999641418, 'eval_accuracy': 0.7156862745098039, 'eval_f1': 0.8258258258258258, 'epoch': 0.5}\n",
"\u001b[2m\u001b[36m(pid=54417)\u001b[0m {'eval_loss': 0.5938675999641418, 'eval_accuracy': 0.7156862745098039, 'eval_f1': 0.8258258258258258, 'epoch': 0.5}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2021-06-16 10:51:34,598\tINFO tune.py:450 -- Total run time: 151.57 seconds (136.77 seconds for the tuning loop).\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total running time: 151.59901237487793 seconds\n"
]
},
{
"data": {
"text/html": [
"\n",
" <div>\n",
" <style>\n",
" /* Turns off some styling */\n",
" progress {\n",
" /* gets rid of default border in Firefox and Opera. */\n",
" border: none;\n",
" /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
" background-size: auto;\n",
" }\n",
" </style>\n",
" \n",
" <progress value='432' max='432' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
" [432/432 00:33]\n",
" </div>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Your output will not be synced to azure because azure key and container name are not specified\n",
"The path for saving the prediction .zip file is not specified, setting to data/ by default\n",
"Your output will not be synced to azure because azure key and container name are not specified\n",
"{'eval_accuracy': 0.7475490196078431, 'eval_f1': 0.8325203252032519, 'eval_loss': 0.5056071877479553}\n"
]
}
],
"source": [
"hpo_space_fixwr = {\n",
" \"learning_rate\": {\"l\": 3e-5, \"u\": 1.5e-4, \"space\": \"log\"},\n",
" \"warmup_ratio\": [0.1],\n",
" \"num_train_epochs\": [3],\n",
" \"per_device_train_batch_size\": [16, 32, 64],\n",
" \"weight_decay\": {\"l\": 0.0, \"u\": 0.3, \"space\": \"linear\"},\n",
" \"attention_probs_dropout_prob\": {\"l\": 0, \"u\": 0.2, \"space\": \"linear\"},\n",
" \"hidden_dropout_prob\": {\"l\": 0, \"u\": 0.2, \"space\": \"linear\"},\n",
" }\n",
"tune_hpo(GST, hpo_space_fixwr)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The validation F1/accuracy we got was 92.6/89.7, the test F1/accuracy was 85.9/78.7, therefore overfitting still exists and we further reduce the space: "
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"== Status ==<br>Memory usage on this node: 29.6/376.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 0/96 CPUs, 0/4 GPUs, 0.0/247.46 GiB heap, 0.0/75.93 GiB objects (0/1.0 accelerator_type:V100)<br>Current best trial: 96a67_00003 with accuracy=0.7107843137254902 and parameters={'learning_rate': 7.862589064613256e-05, 'warmup_ratio': 0.1, 'num_train_epochs': 0.5, 'per_device_train_batch_size': 32, 'weight_decay': 0.0, 'attention_probs_dropout_prob': 0.1, 'hidden_dropout_prob': 0.1, 'seed': 42}<br>Result logdir: /data/xliu127/projects/hyperopt/FLAML/notebook/data/checkpoint/dat=glue_subdat=mrpc_mod=hpo_spa=cus_arg=dft_alg=rs_pru=None_pre=electra_presz=base_spt=ori_rep=0_sddt=43_sdhf=42_var1=None_var2=None/ray_result<br>Number of trials: 6/infinite (6 TERMINATED)<br><br>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(pid=57835)\u001b[0m {'eval_loss': 0.5822290778160095, 'eval_accuracy': 0.7058823529411765, 'eval_f1': 0.8181818181818181, 'epoch': 0.5043478260869565}\n",
"\u001b[2m\u001b[36m(pid=57835)\u001b[0m {'eval_loss': 0.5822290778160095, 'eval_accuracy': 0.7058823529411765, 'eval_f1': 0.8181818181818181, 'epoch': 0.5043478260869565}\n",
"\u001b[2m\u001b[36m(pid=57835)\u001b[0m {'eval_loss': 0.5822290778160095, 'eval_accuracy': 0.7058823529411765, 'eval_f1': 0.8181818181818181, 'epoch': 0.5043478260869565}\n",
"\u001b[2m\u001b[36m(pid=57835)\u001b[0m {'eval_loss': 0.5822290778160095, 'eval_accuracy': 0.7058823529411765, 'eval_f1': 0.8181818181818181, 'epoch': 0.5043478260869565}\n",
"\u001b[2m\u001b[36m(pid=57836)\u001b[0m {'eval_loss': 0.6087244749069214, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'epoch': 0.10344827586206896}\n",
"\u001b[2m\u001b[36m(pid=57836)\u001b[0m {'eval_loss': 0.6087244749069214, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'epoch': 0.10344827586206896}\n",
"\u001b[2m\u001b[36m(pid=57836)\u001b[0m {'eval_loss': 0.6087244749069214, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'epoch': 0.10344827586206896}\n",
"\u001b[2m\u001b[36m(pid=57836)\u001b[0m {'eval_loss': 0.6087244749069214, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'epoch': 0.10344827586206896}\n",
"\u001b[2m\u001b[36m(pid=57839)\u001b[0m {'eval_loss': 0.5486209392547607, 'eval_accuracy': 0.7034313725490197, 'eval_f1': 0.8141321044546851, 'epoch': 0.5}\n",
"\u001b[2m\u001b[36m(pid=57839)\u001b[0m {'eval_loss': 0.5486209392547607, 'eval_accuracy': 0.7034313725490197, 'eval_f1': 0.8141321044546851, 'epoch': 0.5}\n",
"\u001b[2m\u001b[36m(pid=57839)\u001b[0m {'eval_loss': 0.5486209392547607, 'eval_accuracy': 0.7034313725490197, 'eval_f1': 0.8141321044546851, 'epoch': 0.5}\n",
"\u001b[2m\u001b[36m(pid=57839)\u001b[0m {'eval_loss': 0.5486209392547607, 'eval_accuracy': 0.7034313725490197, 'eval_f1': 0.8141321044546851, 'epoch': 0.5}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2021-06-16 10:54:14,542\tINFO tune.py:450 -- Total run time: 117.99 seconds (112.99 seconds for the tuning loop).\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total running time: 118.01927375793457 seconds\n"
]
},
{
"data": {
"text/html": [
"\n",
" <div>\n",
" <style>\n",
" /* Turns off some styling */\n",
" progress {\n",
" /* gets rid of default border in Firefox and Opera. */\n",
" border: none;\n",
" /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
" background-size: auto;\n",
" }\n",
" </style>\n",
" \n",
" <progress value='432' max='432' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
" [432/432 00:33]\n",
" </div>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Your output will not be synced to azure because azure key and container name are not specified\n",
"The path for saving the prediction .zip file is not specified, setting to data/ by default\n",
"Your output will not be synced to azure because azure key and container name are not specified\n",
"{'eval_accuracy': 0.7181372549019608, 'eval_f1': 0.8174962292609351, 'eval_loss': 0.5494586229324341}\n"
]
}
],
"source": [
"hpo_space_min = {\n",
" \"learning_rate\": {\"l\": 3e-5, \"u\": 1.5e-4, \"space\": \"log\"},\n",
" \"warmup_ratio\": [0.1],\n",
" \"num_train_epochs\": [3],\n",
" \"per_device_train_batch_size\": [16, 32, 64],\n",
" \"weight_decay\": [0.0],\n",
" \"attention_probs_dropout_prob\": [0.1],\n",
" \"hidden_dropout_prob\": [0.1],\n",
" }\n",
"tune_hpo(GST, hpo_space_min)"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"The validation F1/accuracy we got was 90.4/86.7, test F1/accuracy was 83.0/73.0. Since the validation accuracy is below grid search, we increase the budget to 4 * GST:"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"== Status ==<br>Memory usage on this node: 26.2/376.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 0/96 CPUs, 0/4 GPUs, 0.0/247.46 GiB heap, 0.0/75.93 GiB objects (0/1.0 accelerator_type:V100)<br>Current best trial: f5d31_00005 with accuracy=0.7352941176470589 and parameters={'learning_rate': 3.856175093679045e-05, 'warmup_ratio': 0.1, 'num_train_epochs': 0.5, 'per_device_train_batch_size': 16, 'weight_decay': 0.0, 'attention_probs_dropout_prob': 0.1, 'hidden_dropout_prob': 0.1, 'seed': 42}<br>Result logdir: /data/xliu127/projects/hyperopt/FLAML/notebook/data/checkpoint/dat=glue_subdat=mrpc_mod=hpo_spa=cus_arg=dft_alg=rs_pru=None_pre=electra_presz=base_spt=ori_rep=0_sddt=43_sdhf=42_var1=None_var2=None/ray_result<br>Number of trials: 16/infinite (16 TERMINATED)<br><br>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(pid=61251)\u001b[0m {'eval_loss': 0.6236899495124817, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'epoch': 0.5}\n",
"\u001b[2m\u001b[36m(pid=61251)\u001b[0m {'eval_loss': 0.6236899495124817, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'epoch': 0.5}\n",
"\u001b[2m\u001b[36m(pid=61251)\u001b[0m {'eval_loss': 0.6236899495124817, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'epoch': 0.5}\n",
"\u001b[2m\u001b[36m(pid=61251)\u001b[0m {'eval_loss': 0.6236899495124817, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'epoch': 0.5}\n",
"\u001b[2m\u001b[36m(pid=61251)\u001b[0m {'eval_loss': 0.6236899495124817, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'epoch': 0.5}\n",
"\u001b[2m\u001b[36m(pid=61255)\u001b[0m {'eval_loss': 0.6249027848243713, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'epoch': 0.3}\n",
"\u001b[2m\u001b[36m(pid=61255)\u001b[0m {'eval_loss': 0.6249027848243713, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'epoch': 0.3}\n",
"\u001b[2m\u001b[36m(pid=61255)\u001b[0m {'eval_loss': 0.6249027848243713, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'epoch': 0.3}\n",
"\u001b[2m\u001b[36m(pid=61255)\u001b[0m {'eval_loss': 0.6249027848243713, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'epoch': 0.3}\n",
"\u001b[2m\u001b[36m(pid=61255)\u001b[0m {'eval_loss': 0.6249027848243713, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'epoch': 0.3}\n",
"\u001b[2m\u001b[36m(pid=61236)\u001b[0m {'eval_loss': 0.6138392686843872, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'epoch': 0.20689655172413793}\n",
"\u001b[2m\u001b[36m(pid=61236)\u001b[0m {'eval_loss': 0.6138392686843872, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'epoch': 0.20689655172413793}\n",
"\u001b[2m\u001b[36m(pid=61236)\u001b[0m {'eval_loss': 0.6138392686843872, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'epoch': 0.20689655172413793}\n",
"\u001b[2m\u001b[36m(pid=61236)\u001b[0m {'eval_loss': 0.6138392686843872, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'epoch': 0.20689655172413793}\n",
"\u001b[2m\u001b[36m(pid=61236)\u001b[0m {'eval_loss': 0.6138392686843872, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'epoch': 0.20689655172413793}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2021-06-16 11:03:23,308\tINFO tune.py:450 -- Total run time: 507.09 seconds (445.79 seconds for the tuning loop).\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total running time: 507.15925645828247 seconds\n"
]
},
{
"data": {
"text/html": [
"\n",
" <div>\n",
" <style>\n",
" /* Turns off some styling */\n",
" progress {\n",
" /* gets rid of default border in Firefox and Opera. */\n",
" border: none;\n",
" /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
" background-size: auto;\n",
" }\n",
" </style>\n",
" \n",
" <progress value='432' max='432' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
" [432/432 00:34]\n",
" </div>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Your output will not be synced to azure because azure key and container name are not specified\n",
"The path for saving the prediction .zip file is not specified, setting to data/ by default\n",
"Your output will not be synced to azure because azure key and container name are not specified\n",
"{'eval_accuracy': 0.7401960784313726, 'eval_f1': 0.8333333333333334, 'eval_loss': 0.5303606986999512}\n"
]
}
],
"source": [
"hpo_space_min = {\n",
" \"learning_rate\": {\"l\": 3e-5, \"u\": 1.5e-4, \"space\": \"log\"},\n",
" \"warmup_ratio\": [0.1],\n",
" \"num_train_epochs\": [3],\n",
" \"per_device_train_batch_size\": [32],\n",
" \"weight_decay\": [0.0],\n",
" \"attention_probs_dropout_prob\": [0.1],\n",
" \"hidden_dropout_prob\": [0.1],\n",
" }\n",
"tune_hpo(4 * GST, hpo_space_min)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The validation F1/accuracy we got was 93.5/91.1, where the accuracy outperforms grid search. The test F1/accuracy was 90.1/86.1. As a result, random search with 4*GST and the minimum space overfits. We stop the troubleshooting process because the search space cannot be further reduced."
]
}
],
"metadata": {
"interpreter": {
"hash": "bfcd9a6a9254a5e160761a1fd7a9e444f011592c6770d9f4180dde058a9df5dd"
},
"kernelspec": {
"display_name": "Python 3.7.7 64-bit ('flaml': conda)",
"name": "python3"
},
"language_info": {
"name": "python",
"version": ""
}
},
"nbformat": 4,
"nbformat_minor": 1
}

View File

@ -1,975 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook uses flaml to finetune a transformer model from Huggingface transformers library.\n",
"\n",
"**Requirements.** This notebook has additional requirements:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# %pip install torch transformers datasets ipywidgets flaml[blendsearch,ray]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Tokenizer"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"MODEL_CHECKPOINT = \"distilbert-base-uncased\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'input_ids': [101, 2023, 2003, 1037, 3231, 102], 'attention_mask': [1, 1, 1, 1, 1, 1]}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer(\"this is a test\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"TASK = \"cola\""
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import datasets"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Reusing dataset glue (/home/ec2-user/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n"
]
}
],
"source": [
"raw_dataset = datasets.load_dataset(\"glue\", TASK)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# define tokenization function used to process data\n",
"COLUMN_NAME = \"sentence\"\n",
"def tokenize(examples):\n",
" return tokenizer(examples[COLUMN_NAME], truncation=True)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "0dcf9ca8ce024a2b832606a6a3219b17",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c58845729f0a4261830ad679891e7c77",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9716d177a40748008cc6089e3d52a1d5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"encoded_dataset = raw_dataset.map(tokenize, batched=True)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n",
" 'idx': 0,\n",
" 'input_ids': [101,\n",
" 2256,\n",
" 2814,\n",
" 2180,\n",
" 1005,\n",
" 1056,\n",
" 4965,\n",
" 2023,\n",
" 4106,\n",
" 1010,\n",
" 2292,\n",
" 2894,\n",
" 1996,\n",
" 2279,\n",
" 2028,\n",
" 2057,\n",
" 16599,\n",
" 1012,\n",
" 102],\n",
" 'label': 1,\n",
" 'sentence': \"Our friends won't buy this analysis, let alone the next one we propose.\"}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"encoded_dataset[\"train\"][0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Model"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoModelForSequenceClassification"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
"- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
}
],
"source": [
"NUM_LABELS = 2\n",
"model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=NUM_LABELS)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DistilBertForSequenceClassification(\n",
" (distilbert): DistilBertModel(\n",
" (embeddings): Embeddings(\n",
" (word_embeddings): Embedding(30522, 768, padding_idx=0)\n",
" (position_embeddings): Embedding(512, 768)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (transformer): Transformer(\n",
" (layer): ModuleList(\n",
" (0): TransformerBlock(\n",
" (attention): MultiHeadSelfAttention(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" )\n",
" (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" (ffn): FFN(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
" (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
" )\n",
" (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" )\n",
" (1): TransformerBlock(\n",
" (attention): MultiHeadSelfAttention(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" )\n",
" (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" (ffn): FFN(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
" (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
" )\n",
" (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" )\n",
" (2): TransformerBlock(\n",
" (attention): MultiHeadSelfAttention(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" )\n",
" (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" (ffn): FFN(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
" (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
" )\n",
" (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" )\n",
" (3): TransformerBlock(\n",
" (attention): MultiHeadSelfAttention(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" )\n",
" (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" (ffn): FFN(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
" (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
" )\n",
" (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" )\n",
" (4): TransformerBlock(\n",
" (attention): MultiHeadSelfAttention(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" )\n",
" (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" (ffn): FFN(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
" (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
" )\n",
" (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" )\n",
" (5): TransformerBlock(\n",
" (attention): MultiHeadSelfAttention(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" )\n",
" (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" (ffn): FFN(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
" (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
" )\n",
" (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (pre_classifier): Linear(in_features=768, out_features=768, bias=True)\n",
" (classifier): Linear(in_features=768, out_features=2, bias=True)\n",
" (dropout): Dropout(p=0.2, inplace=False)\n",
")"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Metric"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"metric = datasets.load_metric(\"glue\", TASK)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Metric(name: \"glue\", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: \"\"\"\n",
"Compute GLUE evaluation metric associated to each GLUE dataset.\n",
"Args:\n",
" predictions: list of predictions to score.\n",
" Each translation should be tokenized into a list of tokens.\n",
" references: list of lists of references for each translation.\n",
" Each reference should be tokenized into a list of tokens.\n",
"Returns: depending on the GLUE subset, one or several of:\n",
" \"accuracy\": Accuracy\n",
" \"f1\": F1 score\n",
" \"pearson\": Pearson Correlation\n",
" \"spearmanr\": Spearman Correlation\n",
" \"matthews_correlation\": Matthew Correlation\n",
"Examples:\n",
"\n",
" >>> glue_metric = datasets.load_metric('glue', 'sst2') # 'sst2' or any of [\"mnli\", \"mnli_mismatched\", \"mnli_matched\", \"qnli\", \"rte\", \"wnli\", \"hans\"]\n",
" >>> references = [0, 1]\n",
" >>> predictions = [0, 1]\n",
" >>> results = glue_metric.compute(predictions=predictions, references=references)\n",
" >>> print(results)\n",
" {'accuracy': 1.0}\n",
"\n",
" >>> glue_metric = datasets.load_metric('glue', 'mrpc') # 'mrpc' or 'qqp'\n",
" >>> references = [0, 1]\n",
" >>> predictions = [0, 1]\n",
" >>> results = glue_metric.compute(predictions=predictions, references=references)\n",
" >>> print(results)\n",
" {'accuracy': 1.0, 'f1': 1.0}\n",
"\n",
" >>> glue_metric = datasets.load_metric('glue', 'stsb')\n",
" >>> references = [0., 1., 2., 3., 4., 5.]\n",
" >>> predictions = [0., 1., 2., 3., 4., 5.]\n",
" >>> results = glue_metric.compute(predictions=predictions, references=references)\n",
" >>> print({\"pearson\": round(results[\"pearson\"], 2), \"spearmanr\": round(results[\"spearmanr\"], 2)})\n",
" {'pearson': 1.0, 'spearmanr': 1.0}\n",
"\n",
" >>> glue_metric = datasets.load_metric('glue', 'cola')\n",
" >>> references = [0, 1]\n",
" >>> predictions = [0, 1]\n",
" >>> results = glue_metric.compute(predictions=predictions, references=references)\n",
" >>> print(results)\n",
" {'matthews_correlation': 1.0}\n",
"\"\"\", stored examples: 0)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"metric"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"def compute_metrics(eval_pred):\n",
" predictions, labels = eval_pred\n",
" predictions = np.argmax(predictions, axis=1)\n",
" return metric.compute(predictions=predictions, references=labels)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training (aka Finetuning)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"from transformers import Trainer\n",
"from transformers import TrainingArguments"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"args = TrainingArguments(\n",
" output_dir='output',\n",
" do_eval=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"trainer = Trainer(\n",
" model=model,\n",
" args=args,\n",
" train_dataset=encoded_dataset[\"train\"],\n",
" eval_dataset=encoded_dataset[\"validation\"],\n",
" tokenizer=tokenizer,\n",
" compute_metrics=compute_metrics,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
]
},
{
"data": {
"text/html": [
"\n",
" <div>\n",
" <style>\n",
" /* Turns off some styling */\n",
" progress {\n",
" /* gets rid of default border in Firefox and Opera. */\n",
" border: none;\n",
" /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
" background-size: auto;\n",
" }\n",
" </style>\n",
" \n",
" <progress value='1591' max='3207' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
" [1591/3207 1:03:06 < 1:04:11, 0.42 it/s, Epoch 1.49/3]\n",
" </div>\n",
" <table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: left;\">\n",
" <th>Step</th>\n",
" <th>Training Loss</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>500</td>\n",
" <td>0.571000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1000</td>\n",
" <td>0.515400</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1500</td>\n",
" <td>0.356100</td>\n",
" </tr>\n",
" </tbody>\n",
"</table><p>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"trainer.train()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Hyperparameter Optimization\n",
"\n",
"`flaml.tune` is a module for economical hyperparameter tuning. It frees users from manually tuning many hyperparameters for a software, such as machine learning training procedures. \n",
"The API is compatible with ray tune.\n",
"\n",
"### Step 1. Define training method\n",
"\n",
"We define a function `train_distilbert(config: dict)` that accepts a hyperparameter configuration dict `config`. The specific configs will be generated by flaml's search algorithm in a given search space.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import flaml\n",
"\n",
"def train_distilbert(config: dict):\n",
"\n",
" # Load CoLA dataset and apply tokenizer\n",
" cola_raw = datasets.load_dataset(\"glue\", TASK)\n",
" cola_encoded = cola_raw.map(tokenize, batched=True)\n",
" train_dataset, eval_dataset = cola_encoded[\"train\"], cola_encoded[\"validation\"]\n",
"\n",
" model = AutoModelForSequenceClassification.from_pretrained(\n",
" MODEL_CHECKPOINT, num_labels=NUM_LABELS\n",
" )\n",
"\n",
" metric = datasets.load_metric(\"glue\", TASK)\n",
" def compute_metrics(eval_pred):\n",
" predictions, labels = eval_pred\n",
" predictions = np.argmax(predictions, axis=1)\n",
" return metric.compute(predictions=predictions, references=labels)\n",
"\n",
" training_args = TrainingArguments(\n",
" output_dir='.',\n",
" do_eval=False,\n",
" disable_tqdm=True,\n",
" logging_steps=20000,\n",
" save_total_limit=0,\n",
" **config,\n",
" )\n",
"\n",
" trainer = Trainer(\n",
" model,\n",
" training_args,\n",
" train_dataset=train_dataset,\n",
" eval_dataset=eval_dataset,\n",
" tokenizer=tokenizer,\n",
" compute_metrics=compute_metrics,\n",
" )\n",
"\n",
" # train model\n",
" trainer.train()\n",
"\n",
" # evaluate model\n",
" eval_output = trainer.evaluate()\n",
"\n",
" # report the metric to optimize\n",
" flaml.tune.report(\n",
" loss=eval_output[\"eval_loss\"],\n",
" matthews_correlation=eval_output[\"eval_matthews_correlation\"],\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Step 2. Define the search\n",
"\n",
"We are now ready to define our search. This includes:\n",
"\n",
"- The `search_space` for our hyperparameters\n",
"- The metric and the mode ('max' or 'min') for optimization\n",
"- The constraints (`n_cpus`, `n_gpus`, `num_samples`, and `time_budget_s`)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"max_num_epoch = 64\n",
"search_space = {\n",
" # You can mix constants with search space objects.\n",
" \"num_train_epochs\": flaml.tune.loguniform(1, max_num_epoch),\n",
" \"learning_rate\": flaml.tune.loguniform(1e-6, 1e-4),\n",
" \"adam_epsilon\": flaml.tune.loguniform(1e-9, 1e-7),\n",
" \"adam_beta1\": flaml.tune.uniform(0.8, 0.99),\n",
" \"adam_beta2\": flaml.tune.loguniform(98e-2, 9999e-4),\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# optimization objective\n",
"HP_METRIC, MODE = \"matthews_correlation\", \"max\"\n",
"\n",
"# resources\n",
"num_cpus = 4\n",
"num_gpus = 4\n",
"\n",
"# constraints\n",
"num_samples = -1 # number of trials, -1 means unlimited\n",
"time_budget_s = 3600 # time budget in seconds"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Step 3. Launch with `flaml.tune.run`\n",
"\n",
"We are now ready to launch the tuning using `flaml.tune.run`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ec2-user/miniconda3/envs/myflaml/lib/python3.8/site-packages/ray/_private/services.py:238: UserWarning: Not all Ray Dashboard dependencies were found. To use the dashboard please install Ray using `pip install ray[default]`. To disable this message, set RAY_DISABLE_IMPORT_WARNING env var to '1'.\n",
" warnings.warn(warning_message)\n",
"2021-12-01 23:35:54,348\tWARNING function_runner.py:558 -- Function checkpointing is disabled. This may result in unexpected behavior when using checkpointing features or certain schedulers. To enable, set the train function arguments to be `func(config, checkpoint_dir=None)`.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tuning started...\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
]
},
{
"data": {
"text/html": [
"== Status ==<br>Memory usage on this node: 4.3/7.7 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4.0/4 CPUs, 4.0/4 GPUs, 0.0/2.34 GiB heap, 0.0/1.17 GiB objects<br>Result logdir: /home/ec2-user/FLAML/notebook/logs/train_distilbert_2021-12-01_23-35-54<br>Number of trials: 1/infinite (1 RUNNING)<br><br>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"== Status ==<br>Memory usage on this node: 4.5/7.7 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4.0/4 CPUs, 4.0/4 GPUs, 0.0/2.34 GiB heap, 0.0/1.17 GiB objects<br>Result logdir: /home/ec2-user/FLAML/notebook/logs/train_distilbert_2021-12-01_23-35-54<br>Number of trials: 2/infinite (1 PENDING, 1 RUNNING)<br><br>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"== Status ==<br>Memory usage on this node: 4.6/7.7 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4.0/4 CPUs, 4.0/4 GPUs, 0.0/2.34 GiB heap, 0.0/1.17 GiB objects<br>Result logdir: /home/ec2-user/FLAML/notebook/logs/train_distilbert_2021-12-01_23-35-54<br>Number of trials: 2/infinite (1 PENDING, 1 RUNNING)<br><br>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(pid=11344)\u001b[0m Reusing dataset glue (/home/ec2-user/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
" 0%| | 0/9 [00:00<?, ?ba/s]\n",
" 22%|██▏ | 2/9 [00:00<00:00, 19.41ba/s]\n",
" 56%|█████▌ | 5/9 [00:00<00:00, 20.98ba/s]\n",
" 89%|████████▉ | 8/9 [00:00<00:00, 21.75ba/s]\n",
"100%|██████████| 9/9 [00:00<00:00, 24.49ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 42.79ba/s]\n",
" 0%| | 0/2 [00:00<?, ?ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 41.48ba/s]\n",
"\u001b[2m\u001b[36m(pid=11344)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
"\u001b[2m\u001b[36m(pid=11344)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"\u001b[2m\u001b[36m(pid=11344)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"\u001b[2m\u001b[36m(pid=11344)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
"\u001b[2m\u001b[36m(pid=11344)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2m\u001b[36m(pid=11344)\u001b[0m huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"\u001b[2m\u001b[36m(pid=11344)\u001b[0m To disable this warning, you can either:\n",
"\u001b[2m\u001b[36m(pid=11344)\u001b[0m \t- Avoid using `tokenizers` before the fork if possible\n",
"\u001b[2m\u001b[36m(pid=11344)\u001b[0m \t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"\u001b[2m\u001b[36m(pid=11344)\u001b[0m huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"\u001b[2m\u001b[36m(pid=11344)\u001b[0m To disable this warning, you can either:\n",
"\u001b[2m\u001b[36m(pid=11344)\u001b[0m \t- Avoid using `tokenizers` before the fork if possible\n",
"\u001b[2m\u001b[36m(pid=11344)\u001b[0m \t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
]
}
],
"source": [
"import time\n",
"import ray\n",
"start_time = time.time()\n",
"ray.shutdown()\n",
"ray.init(num_cpus=num_cpus, num_gpus=num_gpus)\n",
"\n",
"print(\"Tuning started...\")\n",
"analysis = flaml.tune.run(\n",
" train_distilbert,\n",
" search_alg=flaml.CFO(\n",
" space=search_space,\n",
" metric=HP_METRIC,\n",
" mode=MODE,\n",
" low_cost_partial_config={\"num_train_epochs\": 1}),\n",
" # uncomment the following if scheduler = 'asha',\n",
" # max_resource=max_num_epoch, min_resource=1,\n",
" resources_per_trial={\"gpu\": num_gpus, \"cpu\": num_cpus},\n",
" local_dir='logs/',\n",
" num_samples=num_samples,\n",
" time_budget_s=time_budget_s,\n",
" use_ray=True,\n",
")\n",
"\n",
"ray.shutdown()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"n_trials=22\n",
"time=3999.769361972809\n",
"Best model eval matthews_correlation: 0.5699\n",
"Best model parameters: {'num_train_epochs': 15.580684188655825, 'learning_rate': 1.2851507818900338e-05, 'adam_epsilon': 8.134982521948352e-08, 'adam_beta1': 0.99, 'adam_beta2': 0.9971094424784387}\n"
]
}
],
"source": [
"best_trial = analysis.get_best_trial(HP_METRIC, MODE, \"all\")\n",
"metric = best_trial.metric_analysis[HP_METRIC][MODE]\n",
"print(f\"n_trials={len(analysis.trials)}\")\n",
"print(f\"time={time.time()-start_time}\")\n",
"print(f\"Best model eval {HP_METRIC}: {metric:.4f}\")\n",
"print(f\"Best model parameters: {best_trial.config}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Next Steps\n",
"\n",
"Notice that we only reported the metric with `flaml.tune.report` at the end of full training loop. It is possible to enable reporting of intermediate performance - allowing early stopping - as follows:\n",
"\n",
"- Huggingface provides _Callbacks_ which can be used to insert the `flaml.tune.report` call inside the training loop\n",
"- Make sure to set `do_eval=True` in the `TrainingArguments` provided to `Trainer` and adjust the evaluation frequency accordingly"
]
}
],
"metadata": {
"interpreter": {
"hash": "1cfcceddaeccda27c3cce104660d474924e2ba82887c0e8e481b6ede3743c483"
},
"kernelspec": {
"display_name": "Python 3.8.5 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
},
"metadata": {
"interpreter": {
"hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@ -1,286 +0,0 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tune neural networks with lexicographic preference across objectives\n",
    "This example tunes a neural network model with two objectives, \"error_rate\" and \"flops\", on the FashionMNIST dataset.\n",
"\n",
"**Requirements.** This notebook requires:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# %pip install torch torchvision flaml[blendsearch,ray] thop"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import thop\n",
"import torch.nn as nn\n",
"from flaml import tune\n",
"import torch.nn.functional as F\n",
"import torchvision\n",
"import numpy as np\n",
"import os\n",
"\n",
"DEVICE = torch.device(\"cpu\")\n",
"BATCHSIZE = 128\n",
"N_TRAIN_EXAMPLES = BATCHSIZE * 30\n",
"N_VALID_EXAMPLES = BATCHSIZE * 10\n",
"data_dir = os.path.abspath(\"data\")\n",
"\n",
"train_dataset = torchvision.datasets.FashionMNIST(\n",
" data_dir,\n",
" train=True,\n",
" download=True,\n",
" transform=torchvision.transforms.ToTensor(),\n",
")\n",
"\n",
"train_loader = torch.utils.data.DataLoader(\n",
" torch.utils.data.Subset(train_dataset, list(range(N_TRAIN_EXAMPLES))),\n",
" batch_size=BATCHSIZE,\n",
" shuffle=True,\n",
")\n",
"\n",
"val_dataset = torchvision.datasets.FashionMNIST(\n",
" data_dir, train=False, transform=torchvision.transforms.ToTensor()\n",
")\n",
"\n",
"val_loader = torch.utils.data.DataLoader(\n",
" torch.utils.data.Subset(val_dataset, list(range(N_VALID_EXAMPLES))),\n",
" batch_size=BATCHSIZE,\n",
" shuffle=True,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Specify the model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def define_model(configuration):\n",
" n_layers = configuration[\"n_layers\"]\n",
" layers = []\n",
" in_features = 28 * 28\n",
" for i in range(n_layers):\n",
" out_features = configuration[\"n_units_l{}\".format(i)]\n",
" layers.append(nn.Linear(in_features, out_features))\n",
" layers.append(nn.ReLU())\n",
" p = configuration[\"dropout_{}\".format(i)]\n",
" layers.append(nn.Dropout(p))\n",
" in_features = out_features\n",
" layers.append(nn.Linear(in_features, 10))\n",
" layers.append(nn.LogSoftmax(dim=1))\n",
" return nn.Sequential(*layers)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def train_model(model, optimizer, train_loader):\n",
" model.train()\n",
" for batch_idx, (data, target) in enumerate(train_loader):\n",
" data, target = data.view(-1, 28 * 28).to(DEVICE), target.to(DEVICE)\n",
" optimizer.zero_grad()\n",
" F.nll_loss(model(data), target).backward()\n",
" optimizer.step()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Metrics "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def eval_model(model, valid_loader):\n",
" model.eval()\n",
" correct = 0\n",
" with torch.no_grad():\n",
" for batch_idx, (data, target) in enumerate(valid_loader):\n",
" data, target = data.view(-1, 28 * 28).to(DEVICE), target.to(DEVICE)\n",
" pred = model(data).argmax(dim=1, keepdim=True)\n",
" correct += pred.eq(target.view_as(pred)).sum().item()\n",
"\n",
" accuracy = correct / N_VALID_EXAMPLES\n",
" flops, params = thop.profile(\n",
" model, inputs=(torch.randn(1, 28 * 28).to(DEVICE),), verbose=False\n",
" )\n",
" return np.log2(flops), 1 - accuracy, params"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluate function"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def evaluate_function(configuration):\n",
" model = define_model(configuration).to(DEVICE)\n",
" optimizer = torch.optim.Adam(model.parameters(), configuration[\"lr\"])\n",
" n_epoch = configuration[\"n_epoch\"]\n",
" for epoch in range(n_epoch):\n",
" train_model(model, optimizer, train_loader)\n",
" flops, error_rate, params = eval_model(model, val_loader)\n",
" return {\"error_rate\": error_rate, \"flops\": flops, \"params\": params}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Lexicographic information across objectives"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"lexico_objectives = {}\n",
"lexico_objectives[\"metrics\"] = [\"error_rate\", \"flops\"]\n",
"lexico_objectives[\"tolerances\"] = {\"error_rate\": 0.02, \"flops\": 0.0}\n",
"lexico_objectives[\"targets\"] = {\"error_rate\": 0.0, \"flops\": 0.0}\n",
"lexico_objectives[\"modes\"] = [\"min\", \"min\"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Search space"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"search_space = {\n",
" \"n_layers\": tune.randint(lower=1, upper=3),\n",
" \"n_units_l0\": tune.randint(lower=4, upper=128),\n",
" \"n_units_l1\": tune.randint(lower=4, upper=128),\n",
" \"n_units_l2\": tune.randint(lower=4, upper=128),\n",
" \"dropout_0\": tune.uniform(lower=0.2, upper=0.5),\n",
" \"dropout_1\": tune.uniform(lower=0.2, upper=0.5),\n",
" \"dropout_2\": tune.uniform(lower=0.2, upper=0.5),\n",
" \"lr\": tune.loguniform(lower=1e-5, upper=1e-1),\n",
" \"n_epoch\": tune.randint(lower=1, upper=20),\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Launch the tuning"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"low_cost_partial_config = {\n",
" \"n_layers\": 1,\n",
" \"n_units_l0\": 4,\n",
" \"n_units_l1\": 4,\n",
" \"n_units_l2\": 4,\n",
" \"n_epoch\": 1,\n",
"}\n",
"\n",
"analysis = tune.run(\n",
" evaluate_function,\n",
" num_samples=-1,\n",
" time_budget_s=100,\n",
" config=search_space,\n",
" use_ray=False,\n",
" lexico_objectives=lexico_objectives,\n",
" low_cost_partial_config=low_cost_partial_config,\n",
")\n",
"result = analysis.best_result\n",
"print(result)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.14 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.14"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,395 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Pytorch model tuning example on CIFAR10\n",
"This notebook uses flaml to tune a pytorch model on CIFAR10. It is modified based on [this example](https://docs.ray.io/en/master/tune/examples/cifar10_pytorch.html).\n",
"\n",
"**Requirements.** This notebook requires:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"%pip install torchvision flaml[blendsearch,ray]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Network Specification"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"import torch.optim as optim\n",
"from torch.utils.data import random_split\n",
"import torchvision\n",
"import torchvision.transforms as transforms\n",
"\n",
"\n",
"class Net(nn.Module):\n",
"\n",
" def __init__(self, l1=120, l2=84):\n",
" super(Net, self).__init__()\n",
" self.conv1 = nn.Conv2d(3, 6, 5)\n",
" self.pool = nn.MaxPool2d(2, 2)\n",
" self.conv2 = nn.Conv2d(6, 16, 5)\n",
" self.fc1 = nn.Linear(16 * 5 * 5, l1)\n",
" self.fc2 = nn.Linear(l1, l2)\n",
" self.fc3 = nn.Linear(l2, 10)\n",
"\n",
" def forward(self, x):\n",
" x = self.pool(F.relu(self.conv1(x)))\n",
" x = self.pool(F.relu(self.conv2(x)))\n",
" x = x.view(-1, 16 * 5 * 5)\n",
" x = F.relu(self.fc1(x))\n",
" x = F.relu(self.fc2(x))\n",
" x = self.fc3(x)\n",
" return x"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def load_data(data_dir=\"data\"):\n",
" transform = transforms.Compose([\n",
" transforms.ToTensor(),\n",
" transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))\n",
" ])\n",
"\n",
" trainset = torchvision.datasets.CIFAR10(\n",
" root=data_dir, train=True, download=True, transform=transform)\n",
"\n",
" testset = torchvision.datasets.CIFAR10(\n",
" root=data_dir, train=False, download=True, transform=transform)\n",
"\n",
" return trainset, testset"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from ray import tune\n",
"\n",
"def train_cifar(config, checkpoint_dir=None, data_dir=None):\n",
" if \"l1\" not in config:\n",
" logger.warning(config)\n",
" net = Net(2**config[\"l1\"], 2**config[\"l2\"])\n",
"\n",
" device = \"cpu\"\n",
" if torch.cuda.is_available():\n",
" device = \"cuda:0\"\n",
" if torch.cuda.device_count() > 1:\n",
" net = nn.DataParallel(net)\n",
" net.to(device)\n",
"\n",
" criterion = nn.CrossEntropyLoss()\n",
" optimizer = optim.SGD(net.parameters(), lr=config[\"lr\"], momentum=0.9)\n",
"\n",
" # The `checkpoint_dir` parameter gets passed by Ray Tune when a checkpoint\n",
" # should be restored.\n",
" if checkpoint_dir:\n",
" checkpoint = os.path.join(checkpoint_dir, \"checkpoint\")\n",
" model_state, optimizer_state = torch.load(checkpoint)\n",
" net.load_state_dict(model_state)\n",
" optimizer.load_state_dict(optimizer_state)\n",
"\n",
" trainset, testset = load_data(data_dir)\n",
"\n",
" test_abs = int(len(trainset) * 0.8)\n",
" train_subset, val_subset = random_split(\n",
" trainset, [test_abs, len(trainset) - test_abs])\n",
"\n",
" trainloader = torch.utils.data.DataLoader(\n",
" train_subset,\n",
" batch_size=int(2**config[\"batch_size\"]),\n",
" shuffle=True,\n",
" num_workers=4)\n",
" valloader = torch.utils.data.DataLoader(\n",
" val_subset,\n",
" batch_size=int(2**config[\"batch_size\"]),\n",
" shuffle=True,\n",
" num_workers=4)\n",
"\n",
" for epoch in range(int(round(config[\"num_epochs\"]))): # loop over the dataset multiple times\n",
" running_loss = 0.0\n",
" epoch_steps = 0\n",
" for i, data in enumerate(trainloader, 0):\n",
" # get the inputs; data is a list of [inputs, labels]\n",
" inputs, labels = data\n",
" inputs, labels = inputs.to(device), labels.to(device)\n",
"\n",
" # zero the parameter gradients\n",
" optimizer.zero_grad()\n",
"\n",
" # forward + backward + optimize\n",
" outputs = net(inputs)\n",
" loss = criterion(outputs, labels)\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" # print statistics\n",
" running_loss += loss.item()\n",
" epoch_steps += 1\n",
" if i % 2000 == 1999: # print every 2000 mini-batches\n",
" print(\"[%d, %5d] loss: %.3f\" % (epoch + 1, i + 1,\n",
" running_loss / epoch_steps))\n",
" running_loss = 0.0\n",
"\n",
" # Validation loss\n",
" val_loss = 0.0\n",
" val_steps = 0\n",
" total = 0\n",
" correct = 0\n",
" for i, data in enumerate(valloader, 0):\n",
" with torch.no_grad():\n",
" inputs, labels = data\n",
" inputs, labels = inputs.to(device), labels.to(device)\n",
"\n",
" outputs = net(inputs)\n",
" _, predicted = torch.max(outputs.data, 1)\n",
" total += labels.size(0)\n",
" correct += (predicted == labels).sum().item()\n",
"\n",
" loss = criterion(outputs, labels)\n",
" val_loss += loss.cpu().numpy()\n",
" val_steps += 1\n",
"\n",
" # Here we save a checkpoint. It is automatically registered with\n",
" # Ray Tune and will potentially be passed as the `checkpoint_dir`\n",
" # parameter in future iterations.\n",
" with tune.checkpoint_dir(step=epoch) as checkpoint_dir:\n",
" path = os.path.join(checkpoint_dir, \"checkpoint\")\n",
" torch.save(\n",
" (net.state_dict(), optimizer.state_dict()), path)\n",
"\n",
" tune.report(loss=(val_loss / val_steps), accuracy=correct / total)\n",
" print(\"Finished Training\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test Accuracy"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def _test_accuracy(net, device=\"cpu\"):\n",
" trainset, testset = load_data()\n",
"\n",
" testloader = torch.utils.data.DataLoader(\n",
" testset, batch_size=4, shuffle=False, num_workers=2)\n",
"\n",
" correct = 0\n",
" total = 0\n",
" with torch.no_grad():\n",
" for data in testloader:\n",
" images, labels = data\n",
" images, labels = images.to(device), labels.to(device)\n",
" outputs = net(images)\n",
" _, predicted = torch.max(outputs.data, 1)\n",
" total += labels.size(0)\n",
" correct += (predicted == labels).sum().item()\n",
"\n",
" return correct / total"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Hyperparameter Optimization"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import flaml\n",
"import os\n",
"\n",
"data_dir = os.path.abspath(\"data\")\n",
"load_data(data_dir) # Download data for all trials before starting the run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Search space"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"max_num_epoch = 100\n",
"config = {\n",
" \"l1\": tune.randint(2, 9), # log transformed with base 2\n",
" \"l2\": tune.randint(2, 9), # log transformed with base 2\n",
" \"lr\": tune.loguniform(1e-4, 1e-1),\n",
" \"num_epochs\": tune.loguniform(1, max_num_epoch),\n",
" \"batch_size\": tune.randint(1, 5) # log transformed with base 2\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"time_budget_s = 3600 # time budget in seconds\n",
"gpus_per_trial = 0.5 # number of gpus for each trial; 0.5 means two training jobs can share one gpu\n",
"num_samples = 500 # maximal number of trials\n",
"np.random.seed(7654321)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Launch the tuning"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"start_time = time.time()\n",
"result = flaml.tune.run(\n",
" tune.with_parameters(train_cifar, data_dir=data_dir),\n",
" config=config,\n",
" metric=\"loss\",\n",
" mode=\"min\",\n",
" low_cost_partial_config={\"num_epochs\": 1},\n",
" max_resource=max_num_epoch,\n",
" min_resource=1,\n",
" scheduler=\"asha\", # need to use tune.report to report intermediate results in train_cifar \n",
" resources_per_trial={\"cpu\": 1, \"gpu\": gpus_per_trial},\n",
" local_dir='logs/',\n",
" num_samples=num_samples,\n",
" time_budget_s=time_budget_s,\n",
" use_ray=True)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(f\"#trials={len(result.trials)}\")\n",
"print(f\"time={time.time()-start_time}\")\n",
"best_trial = result.get_best_trial(\"loss\", \"min\", \"all\")\n",
"print(\"Best trial config: {}\".format(best_trial.config))\n",
"print(\"Best trial final validation loss: {}\".format(\n",
" best_trial.metric_analysis[\"loss\"][\"min\"]))\n",
"print(\"Best trial final validation accuracy: {}\".format(\n",
" best_trial.metric_analysis[\"accuracy\"][\"max\"]))\n",
"\n",
"best_trained_model = Net(2**best_trial.config[\"l1\"],\n",
" 2**best_trial.config[\"l2\"])\n",
"device = \"cpu\"\n",
"if torch.cuda.is_available():\n",
" device = \"cuda:0\"\n",
" if gpus_per_trial > 1:\n",
" best_trained_model = nn.DataParallel(best_trained_model)\n",
"best_trained_model.to(device)\n",
"\n",
"checkpoint_value = (\n",
" getattr(best_trial.checkpoint, \"dir_or_data\", None)\n",
" or best_trial.checkpoint.value\n",
")\n",
"checkpoint_path = os.path.join(checkpoint_value, \"checkpoint\")\n",
"\n",
"model_state, optimizer_state = torch.load(checkpoint_path)\n",
"best_trained_model.load_state_dict(model_state)\n",
"\n",
"test_acc = _test_accuracy(best_trained_model, device)\n",
"print(\"Best trial test set accuracy: {}\".format(test_acc))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.11.0 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
},
"metadata": {
"interpreter": {
"hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
}
},
"vscode": {
"interpreter": {
"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}

File diff suppressed because it is too large Load Diff

View File

@ -1,618 +0,0 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"<a href=\"https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/zeroshot_lightgbm.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Copyright (c) FLAML authors. All rights reserved. \n",
"\n",
"Licensed under the MIT License.\n",
"\n",
"# Zero-shot AutoML with FLAML\n",
"\n",
"\n",
"## Introduction\n",
"\n",
"In this notebook, we demonstrate a basic use case of zero-shot AutoML with FLAML.\n",
"\n",
"FLAML requires `Python>=3.7`. To run this notebook example, please install the [autozero] option:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# %pip install flaml[autozero] lightgbm openml;"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"## What is zero-shot AutoML?\n",
"\n",
"Zero-shot automl means automl systems without expensive tuning. But it does adapt to data.\n",
"A zero-shot automl system will recommend a data-dependent default configuration for a given dataset.\n",
"\n",
"Think about what happens when you use a `LGBMRegressor`. When you initialize a `LGBMRegressor` without any argument, it will set all the hyperparameters to the default values preset by the lightgbm library.\n",
"There is no doubt that these default values have been carefully chosen by the library developers.\n",
"But they are static. They are not adaptive to different datasets.\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': 'warn', 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0}\n"
]
}
],
"source": [
"from lightgbm import LGBMRegressor\n",
"estimator = LGBMRegressor()\n",
"print(estimator.get_params())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It is unlikely that 100 trees with 31 leaves each is the best hyperparameter setting for every dataset.\n",
"\n",
"So, we propose to recommend data-dependent default configurations at runtime. \n",
"All you need to do is to import the `LGBMRegressor` from flaml.default instead of from lightgbm.\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from flaml.default import LGBMRegressor"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Other parts of code remain the same. The new `LGBMRegressor` will automatically choose a configuration according to the training data.\n",
"For different training data the configuration could be different.\n",
"The recommended configuration can be either the same as the static default configuration from the library, or different.\n",
"It is expected to be no worse than the static default configuration in most cases.\n",
"\n",
"For example, let's download [houses dataset](https://www.openml.org/d/537) from OpenML. The task is to predict median price of the house in the region based on demographic composition and a state of housing market in the region."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"slideshow": {
"slide_type": "subslide"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"download dataset from openml\n",
"Dataset name: houses\n",
"X_train.shape: (15480, 8), y_train.shape: (15480,);\n",
"X_test.shape: (5160, 8), y_test.shape: (5160,)\n"
]
}
],
"source": [
"from flaml.data import load_openml_dataset\n",
"X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=537, data_dir='./')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" median_income housing_median_age total_rooms total_bedrooms \\\n",
"19226 7.3003 19 4976.0 711.0 \n",
"14549 5.9547 18 1591.0 268.0 \n",
"9093 3.2125 19 552.0 129.0 \n",
"12213 6.9930 13 270.0 42.0 \n",
"12765 2.5162 21 3260.0 763.0 \n",
"... ... ... ... ... \n",
"13123 4.4125 20 1314.0 229.0 \n",
"19648 2.9135 27 1118.0 195.0 \n",
"9845 3.1977 31 1431.0 370.0 \n",
"10799 5.6315 34 2125.0 498.0 \n",
"2732 1.3882 15 1171.0 328.0 \n",
"\n",
" population households latitude longitude \n",
"19226 1926.0 625.0 38.46 -122.68 \n",
"14549 547.0 243.0 32.95 -117.24 \n",
"9093 314.0 106.0 34.68 -118.27 \n",
"12213 120.0 42.0 33.51 -117.18 \n",
"12765 1735.0 736.0 38.62 -121.41 \n",
"... ... ... ... ... \n",
"13123 712.0 219.0 38.27 -121.26 \n",
"19648 647.0 209.0 37.48 -120.89 \n",
"9845 704.0 393.0 36.58 -121.90 \n",
"10799 1052.0 468.0 33.62 -117.93 \n",
"2732 1024.0 298.0 32.80 -115.56 \n",
"\n",
"[15480 rows x 8 columns]\n"
]
}
],
"source": [
"print(X_train)"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"We fit the `flaml.default.LGBMRegressor` on this dataset."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:flaml.default.suggest:metafeature distance: 0.02197989436019765\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.7019911744574896, 'importance_type': 'split', 'learning_rate': 0.022635758411078528, 'max_depth': -1, 'min_child_samples': 2, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 4797, 'n_jobs': -1, 'num_leaves': 122, 'objective': None, 'random_state': None, 'reg_alpha': 0.004252223402511765, 'reg_lambda': 0.11288241427227624, 'silent': 'warn', 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0, 'max_bin': 511, 'verbose': -1}\n"
]
}
],
"source": [
"estimator = LGBMRegressor() # imported from flaml.default\n",
"estimator.fit(X_train, y_train)\n",
"print(estimator.get_params())"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"The configuration is adapted as shown here. \n",
"The number of trees is 4797, the number of leaves is 122.\n",
"Does it work better than the static default configuration?\n",
    "Let's compare.\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"0.8537444671194614"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"estimator.score(X_test, y_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The data-dependent configuration has a $r^2$ metric 0.8537 on the test data. What about static default configuration from lightgbm?"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0.8296179648694404"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from lightgbm import LGBMRegressor\n",
"estimator = LGBMRegressor()\n",
"estimator.fit(X_train, y_train)\n",
"estimator.score(X_test, y_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The static default configuration gets $r^2=0.8296$, much lower than 0.8537 by the data-dependent configuration using `flaml.default`.\n",
"Again, the only difference in the code is from where you import the `LGBMRegressor`.\n",
"The adaptation to the training dataset is under the hood.\n",
"\n",
"You might wonder, how is it possible to find the data-dependent configuration without tuning?\n",
"The answer is that,\n",
"flaml can recommend good data-dependent default configurations at runtime without tuning only because it mines the hyperparameter configurations across different datasets offline as a preparation step.\n",
"So basically, zero-shot automl shifts the tuning cost from online to offline.\n",
"In the offline preparation stage, we applied `flaml.AutoML`.\n",
"\n",
"### Benefit of zero-shot AutoML\n",
"Now, what is the benefit of zero-shot automl? Or what is the benefit of shifting tuning from online to offline?\n",
"The first benefit is the online computational cost. That is the cost paid by the final consumers of automl. They only need to train one model.\n",
"They get the hyperparameter configuration right away. There is no overhead to worry about.\n",
    "Another big benefit is that your code doesn't need to change. So if you currently have a workflow without the setup for tuning, you can use zero-shot automl without breaking that workflow.\n",
    "Compared to tuning-based automl, zero-shot automl requires less input. For example, it doesn't need a tuning budget, resampling strategy, validation dataset, etc.\n",
    "A related benefit is that you don't need to worry about holding a subset of the training data for validation, which the tuning process might overfit.\n",
"As there is no tuning, you can use all the training data to train your model.\n",
"Finally, you can customize the offline preparation for a domain, and leverage the past tuning experience for better adaptation to similar tasks.\n",
"\n",
"## How to use at runtime\n",
"The easiest way to leverage this technique is to import a \"flamlized\" learner of your favorite choice and use it just as how you use the learner before. \n",
"The automation is done behind the scene.\n",
    "The current list of “flamlized” learners is:\n",
    "* LGBMClassifier, LGBMRegressor (inheriting LGBMClassifier, LGBMRegressor from lightgbm)\n",
    "* XGBClassifier, XGBRegressor (inheriting XGBClassifier, XGBRegressor from xgboost)\n",
"* RandomForestClassifier, RandomForestRegressor (inheriting from scikit-learn)\n",
"* ExtraTreesClassifier, ExtraTreesRegressor (inheriting from scikit-learn)\n",
"They work for classification or regression tasks.\n",
"\n",
"### What's the magic behind the scene?\n",
"`flaml.default.LGBMRegressor` inherits `lightgbm.LGBMRegressor`, so all the methods and attributes in `lightgbm.LGBMRegressor` are still valid in `flaml.default.LGBMRegressor`.\n",
"The difference is, `flaml.default.LGBMRegressor` decides the hyperparameter configurations based on the training data. It would use a different configuration if it is predicted to outperform the original data-independent default. If you inspect the params of the fitted estimator, you can find what configuration is used. If the original default configuration is used, then it is equivalent to the original estimator.\n",
"The recommendation of which configuration should be used is based on offline AutoML run results. Information about the training dataset, such as the size of the dataset will be used to recommend a data-dependent configuration. The recommendation is done instantly in negligible time. The training can be faster or slower than using the original default configuration depending on the recommended configuration. \n",
"\n",
"### Can I check the configuration before training?\n",
"Yes. You can use `suggest_hyperparams()` method to find the suggested configuration.\n",
"For example, when you run the following code with the houses dataset, it will return the hyperparameter configuration instantly, without training the model."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:flaml.default.suggest:metafeature distance: 0.02197989436019765\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'n_estimators': 4797, 'num_leaves': 122, 'min_child_samples': 2, 'learning_rate': 0.022635758411078528, 'colsample_bytree': 0.7019911744574896, 'reg_alpha': 0.004252223402511765, 'reg_lambda': 0.11288241427227624, 'max_bin': 511, 'verbose': -1}\n"
]
}
],
"source": [
"from flaml.default import LGBMRegressor\n",
"\n",
"estimator = LGBMRegressor()\n",
"hyperparams, _, _, _ = estimator.suggest_hyperparams(X_train, y_train)\n",
"print(hyperparams)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can print the configuration as a dictionary, in case you want to check it before you use it for training.\n",
"\n",
"This brings up an equivalent, open-box way for zero-shot AutoML if you would like more control over the training. \n",
"Import the function `preprocess_and_suggest_hyperparams` from `flaml.default`.\n",
"This function takes the task name, the training dataset, and the estimator name as input:"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:flaml.default.suggest:metafeature distance: 0.02197989436019765\n"
]
}
],
"source": [
"from flaml.default import preprocess_and_suggest_hyperparams\n",
"(\n",
" hyperparams,\n",
" estimator_class,\n",
" X_transformed,\n",
" y_transformed,\n",
" feature_transformer,\n",
" label_transformer,\n",
") = preprocess_and_suggest_hyperparams(\"regression\", X_train, y_train, \"lgbm\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It outputs the hyperparameter configurations, estimator class, transformed data, feature transformer and label transformer.\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'lightgbm.sklearn.LGBMRegressor'>\n"
]
}
],
"source": [
"print(estimator_class)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this case, the estimator name is “lgbm”. The corresponding estimator class is `lightgbm.LGBMRegressor`.\n",
    "This line initializes an LGBMRegressor with the recommended hyperparameter configuration:"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"outputs": [],
"source": [
"model = estimator_class(**hyperparams)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Then we can fit the model on the transformed data."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-1 {color: black;background-color: white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover 
label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>LGBMRegressor(colsample_bytree=0.7019911744574896,\n",
" learning_rate=0.022635758411078528, max_bin=511,\n",
" min_child_samples=2, n_estimators=4797, num_leaves=122,\n",
" reg_alpha=0.004252223402511765, reg_lambda=0.11288241427227624,\n",
" verbose=-1)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">LGBMRegressor</label><div class=\"sk-toggleable__content\"><pre>LGBMRegressor(colsample_bytree=0.7019911744574896,\n",
" learning_rate=0.022635758411078528, max_bin=511,\n",
" min_child_samples=2, n_estimators=4797, num_leaves=122,\n",
" reg_alpha=0.004252223402511765, reg_lambda=0.11288241427227624,\n",
" verbose=-1)</pre></div></div></div></div></div>"
],
"text/plain": [
"LGBMRegressor(colsample_bytree=0.7019911744574896,\n",
" learning_rate=0.022635758411078528, max_bin=511,\n",
" min_child_samples=2, n_estimators=4797, num_leaves=122,\n",
" reg_alpha=0.004252223402511765, reg_lambda=0.11288241427227624,\n",
" verbose=-1)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.fit(X_transformed, y_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The feature transformer needs to be applied to the test data before prediction."
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"X_test_transformed = feature_transformer.transform(X_test)\n",
"y_pred = model.predict(X_test_transformed)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
    "These are automated when you use the \"flamlized\" learner. So you don't need to know these details when you don't need to open the box.\n",
    "We demonstrate them here to help you understand what's going on. And in case you need to modify some steps, you know what to do.\n",
"\n",
"(Note that some classifiers like XGBClassifier require the labels to be integers, while others do not. So you can decide whether to use the transformed labels y_transformed and the label transformer label_transformer. Also, each estimator may require specific preprocessing of the data.)\n",
"\n",
"## Combine Zero-shot AutoML and HPO\n",
"\n",
"Zero Shot AutoML is fast and simple to use. It is very useful if speed and simplicity are the primary concerns. \n",
"If you are not satisfied with the accuracy of the zero shot model, you may want to spend extra time to tune the model.\n",
"You can use `flaml.AutoML` to do that. Everything is the same as your normal `AutoML.fit()`, except to set `starting_points=\"data\"`.\n",
"This tells AutoML to start the tuning from the data-dependent default configurations. You can set the tuning budget in the same way as before.\n",
"Note that if you set `max_iter=0` and `time_budget=None`, you are effectively using zero-shot AutoML. \n",
"When `estimator_list` is omitted, the most promising estimator together with its hyperparameter configuration will be tried first, which are both decided by zero-shot automl."
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[flaml.automl.logger: 04-28 02:51:45] {1663} INFO - task = regression\n",
"[flaml.automl.logger: 04-28 02:51:45] {1670} INFO - Data split method: uniform\n",
"[flaml.automl.logger: 04-28 02:51:45] {1673} INFO - Evaluation method: cv\n",
"[flaml.automl.logger: 04-28 02:51:45] {1771} INFO - Minimizing error metric: 1-r2\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:flaml.default.suggest:metafeature distance: 0.02197989436019765\n",
"INFO:flaml.default.suggest:metafeature distance: 0.006677018633540373\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[flaml.automl.logger: 04-28 02:51:45] {1881} INFO - List of ML learners in AutoML Run: ['lgbm']\n",
"[flaml.automl.logger: 04-28 02:51:45] {2191} INFO - iteration 0, current learner lgbm\n",
"[flaml.automl.logger: 04-28 02:53:39] {2317} INFO - Estimated sufficient time budget=1134156s. Estimated necessary time budget=1134s.\n",
"[flaml.automl.logger: 04-28 02:53:39] {2364} INFO - at 113.5s,\testimator lgbm's best error=0.1513,\tbest estimator lgbm's best error=0.1513\n",
"[flaml.automl.logger: 04-28 02:53:39] {2191} INFO - iteration 1, current learner lgbm\n",
"[flaml.automl.logger: 04-28 02:55:32] {2364} INFO - at 226.6s,\testimator lgbm's best error=0.1513,\tbest estimator lgbm's best error=0.1513\n",
"[flaml.automl.logger: 04-28 02:55:54] {2600} INFO - retrain lgbm for 22.3s\n",
"[flaml.automl.logger: 04-28 02:55:54] {2603} INFO - retrained model: LGBMRegressor(colsample_bytree=0.7019911744574896,\n",
" learning_rate=0.02263575841107852, max_bin=511,\n",
" min_child_samples=2, n_estimators=4797, num_leaves=122,\n",
" reg_alpha=0.004252223402511765, reg_lambda=0.11288241427227624,\n",
" verbose=-1)\n",
"[flaml.automl.logger: 04-28 02:55:54] {1911} INFO - fit succeeded\n",
"[flaml.automl.logger: 04-28 02:55:54] {1912} INFO - Time taken to find the best model: 113.4601559638977\n"
]
}
],
"source": [
"from flaml import AutoML\n",
"\n",
"automl = AutoML()\n",
"settings = {\n",
" \"task\": \"regression\",\n",
" \"starting_points\": \"data\",\n",
" \"estimator_list\": [\"lgbm\"],\n",
" \"time_budget\": 300,\n",
"}\n",
"automl.fit(X_train, y_train, **settings)"
]
}
],
"metadata": {
"interpreter": {
"hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1"
},
"kernelspec": {
"display_name": "Python 3.9.9 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

127
setup.py
View File

@ -9,158 +9,57 @@ with open("README.md", "r", encoding="UTF-8") as fh:
# Get the code version
version = {}
with open(os.path.join(here, "flaml/version.py")) as fp:
with open(os.path.join(here, "autogen/version.py")) as fp:
exec(fp.read(), version)
__version__ = version["__version__"]
install_requires = [
"NumPy>=1.17.0rc1",
"openai",
"diskcache",
"termcolor",
]
setuptools.setup(
name="FLAML",
name="AutoGen",
version=__version__,
author="Microsoft Corporation",
author_email="hpo@microsoft.com",
description="A fast library for automated machine learning and tuning",
author="AutoGen",
author_email="autogen@gmail.com",
description="Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/microsoft/FLAML",
packages=setuptools.find_packages(include=["flaml*"]),
url="https://github.com/microsoft/autogen",
packages=setuptools.find_packages(include=["autogen*"]),
package_data={
"flaml.default": ["*/*.json"],
"autogen.default": ["*/*.json"],
},
include_package_data=True,
install_requires=install_requires,
extras_require={
"automl": [
"lightgbm>=2.3.1",
"xgboost>=0.90",
"scipy>=1.4.1",
"pandas>=1.1.4",
"scikit-learn>=0.24",
],
"notebook": [
"jupyter",
],
"spark": [
"pyspark>=3.2.0",
"joblibspark>=0.5.0",
"joblib<1.3.0", # temp solution for joblib 1.3.0 issue, no need once https://github.com/joblib/joblib-spark/pull/48 is merged
],
"test": [
"lightgbm>=2.3.1",
"xgboost>=0.90",
"scipy>=1.4.1",
"pandas>=1.1.4",
"scikit-learn>=0.24",
"thop",
"pytest>=6.1.1",
"coverage>=5.3",
"pre-commit",
"torch",
"torchvision",
"catboost>=0.26,<1.2",
"rgf-python",
"optuna==2.8.0",
"openml",
"statsmodels>=0.12.2",
"psutil==5.8.0",
"dataclasses",
"transformers[torch]==4.26",
"datasets",
"nltk",
"rouge_score",
"hcrystalball==0.1.10",
"seqeval",
"pytorch-forecasting>=0.9.0,<=0.10.1",
"mlflow",
"pyspark>=3.2.0",
"joblibspark>=0.5.0",
"nbconvert",
"nbformat",
"ipykernel",
"pytorch-lightning<1.9.1", # test_forecast_panel
"tensorboardX==2.6", # test_forecast_panel
"requests<2.29.0", # https://github.com/docker/docker-py/issues/3113
"packaging",
"pydantic==1.10.9",
"sympy",
"wolframalpha",
"joblib<1.3.0", # temp solution for joblib 1.3.0 issue, no need once https://github.com/joblib/joblib-spark/pull/48 is merged
],
"catboost": ["catboost>=0.26"],
"blendsearch": [
"optuna==2.8.0",
"packaging",
],
"ray": [
"ray[tune]~=1.13",
],
"azureml": [
"azureml-mlflow",
],
"nni": [
"nni",
],
"vw": [
"vowpalwabbit>=8.10.0, <9.0.0",
"scikit-learn",
],
"hf": [
"transformers[torch]==4.26",
"datasets",
"nltk",
"rouge_score",
"seqeval",
],
"nlp": [ # for backward compatibility; hf is the new option name
"transformers[torch]==4.26",
"datasets",
"nltk",
"rouge_score",
"seqeval",
],
"ts_forecast": [
"holidays<0.14", # to prevent installation error for prophet
"prophet>=1.0.1",
"statsmodels>=0.12.2",
"hcrystalball==0.1.10",
],
"forecast": [
"holidays<0.14", # to prevent installation error for prophet
"prophet>=1.0.1",
"statsmodels>=0.12.2",
"hcrystalball==0.1.10",
"pytorch-forecasting>=0.9.0",
"pytorch-lightning==1.9.0",
"tensorboardX==2.6",
],
"benchmark": ["catboost>=0.26", "psutil==5.8.0", "xgboost==1.3.3", "pandas==1.1.4"],
"openai": ["openai==0.27.8", "diskcache"],
"autogen": ["openai==0.27.8", "diskcache", "termcolor"],
"mathchat": ["openai==0.27.8", "diskcache", "termcolor", "sympy", "pydantic==1.10.9", "wolframalpha"],
"mathchat": ["sympy", "pydantic==1.10.9", "wolframalpha"],
"retrievechat": [
"openai==0.27.8",
"diskcache",
"termcolor",
"chromadb",
"tiktoken",
"sentence_transformers",
],
"synapse": [
"joblibspark>=0.5.0",
"optuna==2.8.0",
"pyspark>=3.2.0",
"joblib<1.3.0", # temp solution for joblib 1.3.0 issue, no need once https://github.com/joblib/joblib-spark/pull/48 is merged
],
"autozero": ["scikit-learn", "pandas", "packaging"],
},
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
python_requires=">=3.6",
python_requires=">=3.8",
)

View File

@ -1,14 +0,0 @@
FROM mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04
RUN pip install azureml-core
RUN pip install flaml[blendsearch,ray]
RUN pip install ray-on-aml
EXPOSE 8265
EXPOSE 6379
USER root
RUN apt-get update
RUN apt-get install -y jq
RUN apt-get install -y rsync

Some files were not shown because too many files have changed in this diff Show More