Merge branch 'main' into LexiFlow

This commit is contained in:
Chi Wang 2022-10-14 11:04:18 -07:00 committed by GitHub
commit cafb67123a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 248 additions and 119 deletions

View File

@ -4,6 +4,8 @@
![Python Version](https://img.shields.io/badge/3.7%20%7C%203.8%20%7C%203.9%20%7C%203.10-blue)
[![Downloads](https://pepy.tech/badge/flaml)](https://pepy.tech/project/flaml)
[![Join the chat at https://gitter.im/FLAMLer/community](https://badges.gitter.im/FLAMLer/community.svg)](https://gitter.im/FLAMLer/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![](https://img.shields.io/discord/1025786666260111483?logo=discord&style=flat)](https://discord.gg/Cppx2vSPVP)
# A Fast Library for Automated Machine Learning & Tuning

View File

@ -1,64 +0,0 @@
from collections import OrderedDict
import transformers
if transformers.__version__.startswith("3"):
from transformers.modeling_electra import ElectraClassificationHead
from transformers.modeling_roberta import RobertaClassificationHead
else:
from transformers.models.electra.modeling_electra import ElectraClassificationHead
from transformers.models.roberta.modeling_roberta import RobertaClassificationHead
# Registry from model-type name (e.g. "electra") to the classification-head
# class that is instantiated for it; looked up by
# AutoSeqClassificationHead.from_model_type_and_config.
MODEL_CLASSIFICATION_HEAD_MAPPING = OrderedDict(
[
("electra", ElectraClassificationHead),
("roberta", RobertaClassificationHead),
]
)
class AutoSeqClassificationHead:
    """Factory that builds the classification head matching a language model type.

    The head class is selected by model-type name from
    ``MODEL_CLASSIFICATION_HEAD_MAPPING`` and instantiated with the model
    configuration via `AutoSeqClassificationHead.from_model_type_and_config`.

    This class cannot be instantiated directly using ``__init__()`` (throws an error).
    """

    def __init__(self):
        # Direct construction is deliberately blocked; callers must go through
        # the classmethod factory below.
        raise EnvironmentError(
            "AutoSeqClassificationHead is designed to be instantiated "
            "using the `AutoSeqClassificationHead.from_model_type_and_config(cls, model_type, config)` methods."
        )

    @classmethod
    def from_model_type_and_config(
        cls, model_type: str, config: transformers.PretrainedConfig
    ):
        """Instantiate one of the classification head classes from the model_type and model configuration.

        Args:
            model_type: A string, which describes the model type, e.g., "electra".
            config: The huggingface class of the model's configuration.

        Returns:
            An instance of the head class registered for ``model_type``,
            constructed as ``head_class(config)``.

        Raises:
            ValueError: If ``model_type`` is not a key of
                ``MODEL_CLASSIFICATION_HEAD_MAPPING``.

        Example:

        ```python
        from transformers import AutoConfig

        model_config = AutoConfig.from_pretrained("google/electra-base-discriminator")
        AutoSeqClassificationHead.from_model_type_and_config("electra", model_config)
        ```
        """
        # Single lookup instead of a `.keys()` membership test followed by indexing.
        head_class = MODEL_CLASSIFICATION_HEAD_MAPPING.get(model_type)
        if head_class is None:
            raise ValueError(
                "Unrecognized configuration class {} for class {}.\n"
                "Model type should be one of {}.".format(
                    config.__class__,
                    cls.__name__,
                    ", ".join(MODEL_CLASSIFICATION_HEAD_MAPPING),
                )
            )
        return head_class(config)

View File

@ -404,10 +404,6 @@ def load_model(checkpoint_path, task, num_labels=None):
transformers.logging.set_verbosity_error()
from transformers import AutoConfig
from ..huggingface.switch_head_auto import (
AutoSeqClassificationHead,
MODEL_CLASSIFICATION_HEAD_MAPPING,
)
from ...data import SEQCLASSIFICATION, SEQREGRESSION, TOKENCLASSIFICATION
def get_this_model(checkpoint_path, task, model_config):
@ -418,7 +414,7 @@ def load_model(checkpoint_path, task, num_labels=None):
if task in (SEQCLASSIFICATION, SEQREGRESSION):
return AutoModelForSequenceClassification.from_pretrained(
checkpoint_path, config=model_config
checkpoint_path, config=model_config, ignore_mismatched_sizes=True
)
elif task == TOKENCLASSIFICATION:
return AutoModelForTokenClassification.from_pretrained(
@ -433,9 +429,6 @@ def load_model(checkpoint_path, task, num_labels=None):
checkpoint_path, config=model_config
)
def is_pretrained_model_in_classification_head_list(model_type):
return model_type in MODEL_CLASSIFICATION_HEAD_MAPPING
def _set_model_config(checkpoint_path):
if task in (SEQCLASSIFICATION, SEQREGRESSION, TOKENCLASSIFICATION):
model_config = AutoConfig.from_pretrained(
@ -448,40 +441,11 @@ def load_model(checkpoint_path, task, num_labels=None):
return model_config
current_config = AutoConfig.from_pretrained(checkpoint_path)
this_model_type, this_vocab_size = (
current_config.model_type,
current_config.vocab_size,
)
this_vocab_size = current_config.vocab_size
if task == SEQCLASSIFICATION:
num_labels_old = current_config.num_labels
if is_pretrained_model_in_classification_head_list(this_model_type):
model_config_num_labels = num_labels_old
else:
model_config_num_labels = num_labels
new_config = _set_model_config(checkpoint_path)
if is_pretrained_model_in_classification_head_list(this_model_type):
if num_labels != num_labels_old:
this_model = get_this_model(checkpoint_path, task, new_config)
new_config.num_labels = num_labels
this_model.num_labels = num_labels
this_model.classifier = (
AutoSeqClassificationHead.from_model_type_and_config(
this_model_type, new_config
)
)
else:
this_model = get_this_model(checkpoint_path, task, new_config)
else:
this_model = get_this_model(checkpoint_path, task, new_config)
this_model.resize_token_embeddings(this_vocab_size)
return this_model
else:
if task == SEQREGRESSION:
model_config_num_labels = 1
elif task == TOKENCLASSIFICATION:
model_config_num_labels = num_labels
model_config = _set_model_config(checkpoint_path)
this_model = get_this_model(checkpoint_path, task, model_config)
return this_model

View File

@ -2,6 +2,8 @@ import sys
import pytest
import requests
from utils import get_toy_data_seqclassification, get_automl_settings
import os
import shutil
@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
@ -71,6 +73,9 @@ def test_hf_data():
del automl
if os.path.exists("test/data/output/"):
shutil.rmtree("test/data/output/")
if __name__ == "__main__":
test_hf_data()

View File

@ -1,14 +1,105 @@
from utils import get_toy_data_multiclassclassification, get_automl_settings
from utils import (
get_toy_data_regression,
get_toy_data_binclassification,
get_toy_data_multiclassclassification,
get_automl_settings,
)
import sys
import pytest
import os
import shutil
data_list = [
"get_toy_data_regression",
"get_toy_data_binclassification",
"get_toy_data_multiclassclassification",
]
model_path_list = [
"textattack/bert-base-uncased-STS-B",
"textattack/bert-base-uncased-SST-2",
"textattack/bert-base-uncased-MNLI",
]
def test_classification_head():
def test_switch_1_1():
    """Run the head-switch check with data_list[0] and model_path_list[0]."""
    loader_name = data_list[0]
    checkpoint = model_path_list[0]
    _test_switch_classificationhead(loader_name, checkpoint)
def test_switch_1_2():
    """Run the head-switch check with data_list[0] and model_path_list[1]."""
    loader_name = data_list[0]
    checkpoint = model_path_list[1]
    _test_switch_classificationhead(loader_name, checkpoint)
def test_switch_1_3():
    """Run the head-switch check with data_list[0] and model_path_list[2]."""
    loader_name = data_list[0]
    checkpoint = model_path_list[2]
    _test_switch_classificationhead(loader_name, checkpoint)
def test_switch_2_1():
    """Run the head-switch check with data_list[1] and model_path_list[0]."""
    loader_name = data_list[1]
    checkpoint = model_path_list[0]
    _test_switch_classificationhead(loader_name, checkpoint)
def test_switch_2_2():
    """Run the head-switch check with data_list[1] and model_path_list[1]."""
    loader_name = data_list[1]
    checkpoint = model_path_list[1]
    _test_switch_classificationhead(loader_name, checkpoint)
def test_switch_2_3():
    """Run the head-switch check with data_list[1] and model_path_list[2]."""
    loader_name = data_list[1]
    checkpoint = model_path_list[2]
    _test_switch_classificationhead(loader_name, checkpoint)
def test_switch_3_1():
    """Run the head-switch check with data_list[2] and model_path_list[0]."""
    loader_name = data_list[2]
    checkpoint = model_path_list[0]
    _test_switch_classificationhead(loader_name, checkpoint)
def test_switch_3_2():
    """Run the head-switch check with data_list[2] and model_path_list[1]."""
    loader_name = data_list[2]
    checkpoint = model_path_list[1]
    _test_switch_classificationhead(loader_name, checkpoint)
def test_switch_3_3():
    """Run the head-switch check with data_list[2] and model_path_list[2]."""
    loader_name = data_list[2]
    checkpoint = model_path_list[2]
    _test_switch_classificationhead(loader_name, checkpoint)
def _test_switch_classificationhead(each_data, each_model_path):
from flaml import AutoML
import requests
X_train, y_train, X_val, y_val = get_toy_data_multiclassclassification()
automl = AutoML()
X_train, y_train, X_val, y_val = globals()[each_data]()
automl_settings = get_automl_settings()
automl_settings["model_path"] = each_model_path
if each_data == "get_toy_data_regression":
automl_settings["task"] = "seq-regression"
automl_settings["metric"] = "pearsonr"
else:
automl_settings["task"] = "seq-classification"
automl_settings["metric"] = "accuracy"
try:
automl.fit(
@ -21,6 +112,9 @@ def test_classification_head():
except requests.exceptions.HTTPError:
return
if os.path.exists("test/data/output/"):
shutil.rmtree("test/data/output/")
if __name__ == "__main__":
test_classification_head()
_test_switch_classificationhead(data_list[0], model_path_list[0])

View File

@ -1,6 +1,8 @@
import sys
import pytest
from utils import get_toy_data_seqclassification, get_automl_settings
import os
import shutil
def custom_metric(
@ -81,6 +83,9 @@ def test_custom_metric():
del automl
if os.path.exists("test/data/output/"):
shutil.rmtree("test/data/output/")
if __name__ == "__main__":
test_custom_metric()

View File

@ -1,6 +1,8 @@
import sys
import pytest
from utils import get_toy_data_seqclassification, get_automl_settings
import os
import shutil
@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
@ -19,6 +21,9 @@ def test_cv():
except requests.exceptions.HTTPError:
return
if os.path.exists("test/data/output/"):
shutil.rmtree("test/data/output/")
if __name__ == "__main__":
test_cv()

View File

@ -1,6 +1,8 @@
import sys
import pytest
from utils import get_toy_data_multiplechoiceclassification, get_automl_settings
import os
import shutil
@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
@ -46,6 +48,9 @@ def test_mcc():
accuracy = round(true_count / len(y_pred), 5)
print("Accuracy: " + str(accuracy))
if os.path.exists("test/data/output/"):
shutil.rmtree("test/data/output/")
if __name__ == "__main__":
test_mcc()

View File

@ -1,6 +1,8 @@
import sys
import pytest
from utils import get_toy_data_seqregression, get_automl_settings
import os
import shutil
@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
@ -32,6 +34,9 @@ def test_regression():
)
automl.predict(X_val)
if os.path.exists("test/data/output/"):
shutil.rmtree("test/data/output/")
if __name__ == "__main__":
test_regression()

View File

@ -2,6 +2,8 @@ import sys
import pytest
import requests
from utils import get_toy_data_summarization, get_automl_settings
import os
import shutil
@pytest.mark.skipif(
@ -48,6 +50,9 @@ def test_summarization():
)
automl.predict(X_test)
if os.path.exists("test/data/output/"):
shutil.rmtree("test/data/output/")
if __name__ == "__main__":
test_summarization()

View File

@ -1,6 +1,8 @@
import sys
import pytest
import requests
import os
import shutil
from utils import (
get_toy_data_tokenclassification_idlabel,
get_toy_data_tokenclassification_tokenlabel,
@ -62,6 +64,9 @@ def test_tokenclassification_idlabel():
if min_inter_result != sys.maxsize:
assert val_loss == min_inter_result
if os.path.exists("test/data/output/"):
shutil.rmtree("test/data/output/")
@pytest.mark.skipif(
sys.platform == "darwin" or sys.version < "3.7",
@ -106,6 +111,9 @@ def test_tokenclassification_tokenlabel():
if min_inter_result != sys.maxsize:
assert val_loss == min_inter_result
if os.path.exists("test/data/output/"):
shutil.rmtree("test/data/output/")
if __name__ == "__main__":
test_tokenclassification_idlabel()

View File

@ -1,6 +1,8 @@
from utils import get_toy_data_seqclassification, get_automl_settings
import sys
from flaml.default import portfolio
import os
import shutil
def pop_args(fit_kwargs):
@ -80,6 +82,9 @@ def test_starting_point_not_in_search_space():
== "albert-base-v2"
)
if os.path.exists("test/data/output/"):
shutil.rmtree("test/data/output/")
def test_points_to_evaluate():
from flaml import AutoML
@ -99,6 +104,9 @@ def test_points_to_evaluate():
automl.fit(X_train, y_train, **automl_settings)
if os.path.exists("test/data/output/"):
shutil.rmtree("test/data/output/")
# TODO: implement _test_zero_shot_model
def test_zero_shot_nomodel():
@ -131,6 +139,9 @@ def test_zero_shot_nomodel():
pop_args(fit_kwargs)
model.fit(X_train, y_train, **fit_kwargs)
if os.path.exists("test/data/output/"):
shutil.rmtree("test/data/output/")
def test_build_error_portfolio(path="./test/nlp/default", strategy="greedy"):
import os
@ -159,3 +170,9 @@ def test_build_error_portfolio(path="./test/nlp/default", strategy="greedy"):
)
except ValueError:
print("Feature not implemented")
import os
import shutil
if os.path.exists("test/data/output/"):
shutil.rmtree("test/data/output/")

View File

@ -70,23 +70,19 @@ def get_toy_data_seqclassification():
return X_train, y_train, X_val, y_val, X_test
def get_toy_data_multiclassclassification():
def get_toy_data_binclassification():
train_data = {
"text": [
"i didnt feel humiliated",
"i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake",
"im grabbing a minute to post i feel greedy wrong",
"i am ever feeling nostalgic about the fireplace i will know that it is still on the property",
"i am feeling grouchy",
"ive been feeling a little burdened lately wasnt sure why that was",
"ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i also feel like so funny",
"i feel as confused about life as a teenager or as jaded as a year old man",
"i have been with petronas for years i feel that petronas has performed well and made a huge profit",
"i feel romantic too",
"i feel like i have to make the suffering i m seeing mean something",
"i do feel that running is a divine experience and that i can expect to have some type of spiritual encounter",
],
"label": [0, 0, 3, 2, 3, 0, 5, 4, 1, 2, 0, 1],
"label": [0, 0, 1, 0, 1, 1, 0, 1],
}
train_dataset = pd.DataFrame(train_data)
@ -95,9 +91,84 @@ def get_toy_data_multiclassclassification():
"i think it s the easiest time of year to feel dissatisfied",
"i feel low energy i m just thirsty",
"i have immense sympathy with the general point but as a possible proto writer trying to find time to write in the corners of life and with no sign of an agent let alone a publishing contract this feels a little precious",
"i do not feel reassured anxiety is on each side",
],
"label": [3, 0, 1, 1],
"label": [0, 1, 1],
}
dev_dataset = pd.DataFrame(dev_data)
custom_sent_keys = ["text"]
label_key = "label"
X_train = train_dataset[custom_sent_keys]
y_train = train_dataset[label_key]
X_val = dev_dataset[custom_sent_keys]
y_val = dev_dataset[label_key]
return X_train, y_train, X_val, y_val
def get_toy_data_regression():
    """Build a tiny in-memory text dataset with float labels.

    Returns:
        A tuple ``(X_train, y_train, X_val, y_val)``. Each ``X`` is a
        DataFrame holding the single "text" column and each ``y`` is the
        matching "label" Series (8 training rows, 3 validation rows).
    """
    feature_columns = ["text"]
    target_column = "label"

    train_frame = pd.DataFrame(
        {
            "text": [
                "i didnt feel humiliated",
                "i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake",
                "i am ever feeling nostalgic about the fireplace i will know that it is still on the property",
                "ive been feeling a little burdened lately wasnt sure why that was",
                "i have been with petronas for years i feel that petronas has performed well and made a huge profit",
                "i feel romantic too",
                "i feel like i have to make the suffering i m seeing mean something",
                "i do feel that running is a divine experience and that i can expect to have some type of spiritual encounter",
            ],
            "label": [1.0, 1.0, 3.0, 1.0, 5.0, 5.0, 1.0, 3.0],
        }
    )

    val_frame = pd.DataFrame(
        {
            "text": [
                "i think it s the easiest time of year to feel dissatisfied",
                "i feel low energy i m just thirsty",
                "i have immense sympathy with the general point but as a possible proto writer trying to find time to write in the corners of life and with no sign of an agent let alone a publishing contract this feels a little precious",
            ],
            "label": [1.0, 3.0, 3.0],
        }
    )

    # Features stay two-dimensional (DataFrame); labels come back as a Series.
    return (
        train_frame[feature_columns],
        train_frame[target_column],
        val_frame[feature_columns],
        val_frame[target_column],
    )
def get_toy_data_multiclassclassification():
train_data = {
"text": [
"i didnt feel humiliated",
"i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake",
"i am ever feeling nostalgic about the fireplace i will know that it is still on the property",
"ive been feeling a little burdened lately wasnt sure why that was",
"i have been with petronas for years i feel that petronas has performed well and made a huge profit",
"i feel romantic too",
"i feel like i have to make the suffering i m seeing mean something",
"i do feel that running is a divine experience and that i can expect to have some type of spiritual encounter",
],
"label": [0, 0, 2, 0, 1, 2, 0, 1],
}
train_dataset = pd.DataFrame(train_data)
dev_data = {
"text": [
"i think it s the easiest time of year to feel dissatisfied",
"i feel low energy i m just thirsty",
"i have immense sympathy with the general point but as a possible proto writer trying to find time to write in the corners of life and with no sign of an agent let alone a publishing contract this feels a little precious",
],
"label": [0, 1, 1],
}
dev_dataset = pd.DataFrame(dev_data)

View File

@ -38,6 +38,13 @@ automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_
automl.predict(X_test)
```
Note that after you run `automl.fit`, the intermediate checkpoints are saved under the specified output_dir `data/output`. If they take up too much storage space, you can remove them with the following code:
```python
if os.path.exists("data/output/"):
shutil.rmtree("data/output/")
```
#### Sample output
```