Merge branch 'main' into LexiFlow

2022-10-14 11:04:18 -07:00 · 2022-10-14 11:04:18 -07:00 · cafb67123a
parent 2334a9c81d d3e0d1d852
commit cafb67123a
14 changed files with 248 additions and 119 deletions
--- a/README.md
+++ b/README.md
@ -4,6 +4,8 @@
 ![Python Version](https://img.shields.io/badge/3.7%20%7C%203.8%20%7C%203.9%20%7C%203.10-blue)
 [![Downloads](https://pepy.tech/badge/flaml)](https://pepy.tech/project/flaml)
 [![Join the chat at https://gitter.im/FLAMLer/community](https://badges.gitter.im/FLAMLer/community.svg)](https://gitter.im/FLAMLer/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
+[![Join the Discord chat](https://img.shields.io/discord/1025786666260111483?logo=discord&style=flat)](https://discord.gg/Cppx2vSPVP)
+

 # A Fast Library for Automated Machine Learning & Tuning

--- a/flaml/nlp/huggingface/switch_head_auto.py
+++ b/flaml/nlp/huggingface/switch_head_auto.py
@ -1,64 +0,0 @@
-from collections import OrderedDict
-
-import transformers
-
-if transformers.__version__.startswith("3"):
-    from transformers.modeling_electra import ElectraClassificationHead
-    from transformers.modeling_roberta import RobertaClassificationHead
-
-else:
-    from transformers.models.electra.modeling_electra import ElectraClassificationHead
-    from transformers.models.roberta.modeling_roberta import RobertaClassificationHead
-
-MODEL_CLASSIFICATION_HEAD_MAPPING = OrderedDict(
-    [
-        ("electra", ElectraClassificationHead),
-        ("roberta", RobertaClassificationHead),
-    ]
-)
-
-
-class AutoSeqClassificationHead:
-    """
-    This is a class for getting classification head class based on the name of the LM
-    instantiated as one of the ClassificationHead classes of the library when
-    created with the `AutoSeqClassificationHead.from_model_type_and_config` method.
-
-    This class cannot be instantiated directly using ``__init__()`` (throws an error).
-    """
-
-    def __init__(self):
-        raise EnvironmentError(
-            "AutoSeqClassificationHead is designed to be instantiated "
-            "using the `AutoSeqClassificationHead.from_model_type_and_config(cls, model_type, config)` methods."
-        )
-
-    @classmethod
-    def from_model_type_and_config(
-        cls, model_type: str, config: transformers.PretrainedConfig
-    ):
-        """
-        Instantiate one of the classification head classes from the mode_type and model configuration.
-
-        Args:
-            model_type: A string, which desribes the model type, e.g., "electra".
-            config: The huggingface class of the model's configuration.
-
-        Example:
-
-        ```python
-        from transformers import AutoConfig
-        model_config = AutoConfig.from_pretrained("google/electra-base-discriminator")
-        AutoSeqClassificationHead.from_model_type_and_config("electra", model_config)
-        ```
-        """
-        if model_type in MODEL_CLASSIFICATION_HEAD_MAPPING.keys():
-            return MODEL_CLASSIFICATION_HEAD_MAPPING[model_type](config)
-        raise ValueError(
-            "Unrecognized configuration class {} for class {}.\n"
-            "Model type should be one of {}.".format(
-                config.__class__,
-                cls.__name__,
-                ", ".join(MODEL_CLASSIFICATION_HEAD_MAPPING.keys()),
-            )
-        )
--- a/flaml/nlp/huggingface/utils.py
+++ b/flaml/nlp/huggingface/utils.py
@ -404,10 +404,6 @@ def load_model(checkpoint_path, task, num_labels=None):
    transformers.logging.set_verbosity_error()

    from transformers import AutoConfig
-    from ..huggingface.switch_head_auto import (
-        AutoSeqClassificationHead,
-        MODEL_CLASSIFICATION_HEAD_MAPPING,
-    )
    from ...data import SEQCLASSIFICATION, SEQREGRESSION, TOKENCLASSIFICATION

    def get_this_model(checkpoint_path, task, model_config):
@ -418,7 +414,7 @@ def load_model(checkpoint_path, task, num_labels=None):

        if task in (SEQCLASSIFICATION, SEQREGRESSION):
            return AutoModelForSequenceClassification.from_pretrained(
-                checkpoint_path, config=model_config
+                checkpoint_path, config=model_config, ignore_mismatched_sizes=True
            )
        elif task == TOKENCLASSIFICATION:
            return AutoModelForTokenClassification.from_pretrained(
@ -433,9 +429,6 @@ def load_model(checkpoint_path, task, num_labels=None):
                checkpoint_path, config=model_config
            )

-    def is_pretrained_model_in_classification_head_list(model_type):
-        return model_type in MODEL_CLASSIFICATION_HEAD_MAPPING
-
    def _set_model_config(checkpoint_path):
        if task in (SEQCLASSIFICATION, SEQREGRESSION, TOKENCLASSIFICATION):
            model_config = AutoConfig.from_pretrained(
@ -448,40 +441,11 @@ def load_model(checkpoint_path, task, num_labels=None):
            return model_config

    current_config = AutoConfig.from_pretrained(checkpoint_path)
-    this_model_type, this_vocab_size = (
-        current_config.model_type,
-        current_config.vocab_size,
-    )
+    this_vocab_size = current_config.vocab_size

-    if task == SEQCLASSIFICATION:
-        num_labels_old = current_config.num_labels
-        if is_pretrained_model_in_classification_head_list(this_model_type):
-            model_config_num_labels = num_labels_old
-        else:
    model_config_num_labels = num_labels
    new_config = _set_model_config(checkpoint_path)

-        if is_pretrained_model_in_classification_head_list(this_model_type):
-            if num_labels != num_labels_old:
-                this_model = get_this_model(checkpoint_path, task, new_config)
-                new_config.num_labels = num_labels
-                this_model.num_labels = num_labels
-                this_model.classifier = (
-                    AutoSeqClassificationHead.from_model_type_and_config(
-                        this_model_type, new_config
-                    )
-                )
-            else:
-                this_model = get_this_model(checkpoint_path, task, new_config)
-        else:
    this_model = get_this_model(checkpoint_path, task, new_config)
    this_model.resize_token_embeddings(this_vocab_size)
    return this_model
-    else:
-        if task == SEQREGRESSION:
-            model_config_num_labels = 1
-        elif task == TOKENCLASSIFICATION:
-            model_config_num_labels = num_labels
-        model_config = _set_model_config(checkpoint_path)
-        this_model = get_this_model(checkpoint_path, task, model_config)
-        return this_model
--- a/test/nlp/test_autohf.py
+++ b/test/nlp/test_autohf.py
@ -2,6 +2,8 @@ import sys
 import pytest
 import requests
 from utils import get_toy_data_seqclassification, get_automl_settings
+import os
+import shutil


@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
@ -71,6 +73,9 @@ def test_hf_data():

    del automl

+    if os.path.exists("test/data/output/"):
+        shutil.rmtree("test/data/output/")
+

 if __name__ == "__main__":
    test_hf_data()
--- a/test/nlp/test_autohf_classificationhead.py
+++ b/test/nlp/test_autohf_classificationhead.py
@ -1,14 +1,105 @@
-from utils import get_toy_data_multiclassclassification, get_automl_settings
+from utils import (
+    get_toy_data_regression,
+    get_toy_data_binclassification,
+    get_toy_data_multiclassclassification,
+    get_automl_settings,
+)
+import sys
+import pytest
+import os
+import shutil
+
+data_list = [
+    "get_toy_data_regression",
+    "get_toy_data_binclassification",
+    "get_toy_data_multiclassclassification",
+]
+model_path_list = [
+    "textattack/bert-base-uncased-STS-B",
+    "textattack/bert-base-uncased-SST-2",
+    "textattack/bert-base-uncased-MNLI",
+]


-def test_classification_head():
+def test_switch_1_1():
+    data_idx, model_path_idx = 0, 0
+    _test_switch_classificationhead(
+        data_list[data_idx], model_path_list[model_path_idx]
+    )
+
+
+def test_switch_1_2():
+    data_idx, model_path_idx = 0, 1
+    _test_switch_classificationhead(
+        data_list[data_idx], model_path_list[model_path_idx]
+    )
+
+
+def test_switch_1_3():
+    data_idx, model_path_idx = 0, 2
+    _test_switch_classificationhead(
+        data_list[data_idx], model_path_list[model_path_idx]
+    )
+
+
+def test_switch_2_1():
+    data_idx, model_path_idx = 1, 0
+    _test_switch_classificationhead(
+        data_list[data_idx], model_path_list[model_path_idx]
+    )
+
+
+def test_switch_2_2():
+    data_idx, model_path_idx = 1, 1
+    _test_switch_classificationhead(
+        data_list[data_idx], model_path_list[model_path_idx]
+    )
+
+
+def test_switch_2_3():
+    data_idx, model_path_idx = 1, 2
+    _test_switch_classificationhead(
+        data_list[data_idx], model_path_list[model_path_idx]
+    )
+
+
+def test_switch_3_1():
+    data_idx, model_path_idx = 2, 0
+    _test_switch_classificationhead(
+        data_list[data_idx], model_path_list[model_path_idx]
+    )
+
+
+def test_switch_3_2():
+    data_idx, model_path_idx = 2, 1
+    _test_switch_classificationhead(
+        data_list[data_idx], model_path_list[model_path_idx]
+    )
+
+
+def test_switch_3_3():
+    data_idx, model_path_idx = 2, 2
+    _test_switch_classificationhead(
+        data_list[data_idx], model_path_list[model_path_idx]
+    )
+
+
+def _test_switch_classificationhead(each_data, each_model_path):
    from flaml import AutoML
    import requests

-    X_train, y_train, X_val, y_val = get_toy_data_multiclassclassification()
    automl = AutoML()

+    X_train, y_train, X_val, y_val = globals()[each_data]()
    automl_settings = get_automl_settings()
+    automl_settings["model_path"] = each_model_path
+
+    if each_data == "get_toy_data_regression":
+        automl_settings["task"] = "seq-regression"
+        automl_settings["metric"] = "pearsonr"
+    else:
+        automl_settings["task"] = "seq-classification"
+        automl_settings["metric"] = "accuracy"

    try:
        automl.fit(
@ -21,6 +112,9 @@ def test_classification_head():
    except requests.exceptions.HTTPError:
        return

+    if os.path.exists("test/data/output/"):
+        shutil.rmtree("test/data/output/")
+

 if __name__ == "__main__":
-    test_classification_head()
+    _test_switch_classificationhead(data_list[0], model_path_list[0])
--- a/test/nlp/test_autohf_custom_metric.py
+++ b/test/nlp/test_autohf_custom_metric.py
@ -1,6 +1,8 @@
 import sys
 import pytest
 from utils import get_toy_data_seqclassification, get_automl_settings
+import os
+import shutil


 def custom_metric(
@ -81,6 +83,9 @@ def test_custom_metric():

    del automl

+    if os.path.exists("test/data/output/"):
+        shutil.rmtree("test/data/output/")
+

 if __name__ == "__main__":
    test_custom_metric()
--- a/test/nlp/test_autohf_cv.py
+++ b/test/nlp/test_autohf_cv.py
@ -1,6 +1,8 @@
 import sys
 import pytest
 from utils import get_toy_data_seqclassification, get_automl_settings
+import os
+import shutil


@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
@ -19,6 +21,9 @@ def test_cv():
    except requests.exceptions.HTTPError:
        return

+    if os.path.exists("test/data/output/"):
+        shutil.rmtree("test/data/output/")
+

 if __name__ == "__main__":
    test_cv()
--- a/test/nlp/test_autohf_multichoice_classification.py
+++ b/test/nlp/test_autohf_multichoice_classification.py
@ -1,6 +1,8 @@
 import sys
 import pytest
 from utils import get_toy_data_multiplechoiceclassification, get_automl_settings
+import os
+import shutil


@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
@ -46,6 +48,9 @@ def test_mcc():
    accuracy = round(true_count / len(y_pred), 5)
    print("Accuracy: " + str(accuracy))

+    if os.path.exists("test/data/output/"):
+        shutil.rmtree("test/data/output/")
+

 if __name__ == "__main__":
    test_mcc()
--- a/test/nlp/test_autohf_regression.py
+++ b/test/nlp/test_autohf_regression.py
@ -1,6 +1,8 @@
 import sys
 import pytest
 from utils import get_toy_data_seqregression, get_automl_settings
+import os
+import shutil


@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
@ -32,6 +34,9 @@ def test_regression():
    )
    automl.predict(X_val)

+    if os.path.exists("test/data/output/"):
+        shutil.rmtree("test/data/output/")
+

 if __name__ == "__main__":
    test_regression()
--- a/test/nlp/test_autohf_summarization.py
+++ b/test/nlp/test_autohf_summarization.py
@ -2,6 +2,8 @@ import sys
 import pytest
 import requests
 from utils import get_toy_data_summarization, get_automl_settings
+import os
+import shutil


@pytest.mark.skipif(
@ -48,6 +50,9 @@ def test_summarization():
    )
    automl.predict(X_test)

+    if os.path.exists("test/data/output/"):
+        shutil.rmtree("test/data/output/")
+

 if __name__ == "__main__":
    test_summarization()
--- a/test/nlp/test_autohf_tokenclassification.py
+++ b/test/nlp/test_autohf_tokenclassification.py
@ -1,6 +1,8 @@
 import sys
 import pytest
 import requests
+import os
+import shutil
 from utils import (
    get_toy_data_tokenclassification_idlabel,
    get_toy_data_tokenclassification_tokenlabel,
@ -62,6 +64,9 @@ def test_tokenclassification_idlabel():
                if min_inter_result != sys.maxsize:
                    assert val_loss == min_inter_result

+    if os.path.exists("test/data/output/"):
+        shutil.rmtree("test/data/output/")
+

@pytest.mark.skipif(
    sys.platform == "darwin" or sys.version < "3.7",
@ -106,6 +111,9 @@ def test_tokenclassification_tokenlabel():
                if min_inter_result != sys.maxsize:
                    assert val_loss == min_inter_result

+    if os.path.exists("test/data/output/"):
+        shutil.rmtree("test/data/output/")
+

 if __name__ == "__main__":
    test_tokenclassification_idlabel()
--- a/test/nlp/test_default.py
+++ b/test/nlp/test_default.py
@ -1,6 +1,8 @@
 from utils import get_toy_data_seqclassification, get_automl_settings
 import sys
 from flaml.default import portfolio
+import os
+import shutil


 def pop_args(fit_kwargs):
@ -80,6 +82,9 @@ def test_starting_point_not_in_search_space():
        == "albert-base-v2"
    )

+    if os.path.exists("test/data/output/"):
+        shutil.rmtree("test/data/output/")
+

 def test_points_to_evaluate():
    from flaml import AutoML
@ -99,6 +104,9 @@ def test_points_to_evaluate():

    automl.fit(X_train, y_train, **automl_settings)

+    if os.path.exists("test/data/output/"):
+        shutil.rmtree("test/data/output/")
+

 # TODO: implement _test_zero_shot_model
 def test_zero_shot_nomodel():
@ -131,6 +139,9 @@ def test_zero_shot_nomodel():
    pop_args(fit_kwargs)
    model.fit(X_train, y_train, **fit_kwargs)

+    if os.path.exists("test/data/output/"):
+        shutil.rmtree("test/data/output/")
+

 def test_build_error_portfolio(path="./test/nlp/default", strategy="greedy"):
    import os
@ -159,3 +170,9 @@ def test_build_error_portfolio(path="./test/nlp/default", strategy="greedy"):
        )
    except ValueError:
        print("Feature not implemented")
+
+    import os
+    import shutil
+
+    if os.path.exists("test/data/output/"):
+        shutil.rmtree("test/data/output/")
--- a/test/nlp/utils.py
+++ b/test/nlp/utils.py
@ -70,23 +70,19 @@ def get_toy_data_seqclassification():
    return X_train, y_train, X_val, y_val, X_test


-def get_toy_data_multiclassclassification():
+def get_toy_data_binclassification():
    train_data = {
        "text": [
            "i didnt feel humiliated",
            "i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake",
-            "im grabbing a minute to post i feel greedy wrong",
            "i am ever feeling nostalgic about the fireplace i will know that it is still on the property",
-            "i am feeling grouchy",
            "ive been feeling a little burdened lately wasnt sure why that was",
-            "ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i also feel like so funny",
-            "i feel as confused about life as a teenager or as jaded as a year old man",
            "i have been with petronas for years i feel that petronas has performed well and made a huge profit",
            "i feel romantic too",
            "i feel like i have to make the suffering i m seeing mean something",
            "i do feel that running is a divine experience and that i can expect to have some type of spiritual encounter",
        ],
-        "label": [0, 0, 3, 2, 3, 0, 5, 4, 1, 2, 0, 1],
+        "label": [0, 0, 1, 0, 1, 1, 0, 1],
    }
    train_dataset = pd.DataFrame(train_data)

@ -95,9 +91,84 @@ def get_toy_data_multiclassclassification():
            "i think it s the easiest time of year to feel dissatisfied",
            "i feel low energy i m just thirsty",
            "i have immense sympathy with the general point but as a possible proto writer trying to find time to write in the corners of life and with no sign of an agent let alone a publishing contract this feels a little precious",
-            "i do not feel reassured anxiety is on each side",
        ],
-        "label": [3, 0, 1, 1],
+        "label": [0, 1, 1],
+    }
+    dev_dataset = pd.DataFrame(dev_data)
+
+    custom_sent_keys = ["text"]
+    label_key = "label"
+
+    X_train = train_dataset[custom_sent_keys]
+    y_train = train_dataset[label_key]
+
+    X_val = dev_dataset[custom_sent_keys]
+    y_val = dev_dataset[label_key]
+
+    return X_train, y_train, X_val, y_val
+
+
+def get_toy_data_regression():
+    train_data = {
+        "text": [
+            "i didnt feel humiliated",
+            "i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake",
+            "i am ever feeling nostalgic about the fireplace i will know that it is still on the property",
+            "ive been feeling a little burdened lately wasnt sure why that was",
+            "i have been with petronas for years i feel that petronas has performed well and made a huge profit",
+            "i feel romantic too",
+            "i feel like i have to make the suffering i m seeing mean something",
+            "i do feel that running is a divine experience and that i can expect to have some type of spiritual encounter",
+        ],
+        "label": [1.0, 1.0, 3.0, 1.0, 5.0, 5.0, 1.0, 3.0],
+    }
+    train_dataset = pd.DataFrame(train_data)
+
+    dev_data = {
+        "text": [
+            "i think it s the easiest time of year to feel dissatisfied",
+            "i feel low energy i m just thirsty",
+            "i have immense sympathy with the general point but as a possible proto writer trying to find time to write in the corners of life and with no sign of an agent let alone a publishing contract this feels a little precious",
+        ],
+        "label": [1.0, 3.0, 3.0],
+    }
+    dev_dataset = pd.DataFrame(dev_data)
+
+    custom_sent_keys = ["text"]
+    label_key = "label"
+
+    X_train = train_dataset[custom_sent_keys]
+    y_train = train_dataset[label_key]
+
+    X_val = dev_dataset[custom_sent_keys]
+    y_val = dev_dataset[label_key]
+
+    return X_train, y_train, X_val, y_val
+
+
+def get_toy_data_multiclassclassification():
+    train_data = {
+        "text": [
+            "i didnt feel humiliated",
+            "i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake",
+            "i am ever feeling nostalgic about the fireplace i will know that it is still on the property",
+            "ive been feeling a little burdened lately wasnt sure why that was",
+            "i have been with petronas for years i feel that petronas has performed well and made a huge profit",
+            "i feel romantic too",
+            "i feel like i have to make the suffering i m seeing mean something",
+            "i do feel that running is a divine experience and that i can expect to have some type of spiritual encounter",
+        ],
+        "label": [0, 0, 2, 0, 1, 2, 0, 1],
+    }
+    train_dataset = pd.DataFrame(train_data)
+
+    dev_data = {
+        "text": [
+            "i think it s the easiest time of year to feel dissatisfied",
+            "i feel low energy i m just thirsty",
+            "i have immense sympathy with the general point but as a possible proto writer trying to find time to write in the corners of life and with no sign of an agent let alone a publishing contract this feels a little precious",
+        ],
+        "label": [0, 1, 1],
    }
    dev_dataset = pd.DataFrame(dev_data)

--- a/website/docs/Examples/AutoML-NLP.md
+++ b/website/docs/Examples/AutoML-NLP.md
@ -38,6 +38,13 @@ automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_
 automl.predict(X_test)
 ```

+Note that after you run `automl.fit`, the intermediate checkpoints are saved under the specified `output_dir` (`data/output`). If these checkpoints take up too much storage space, you can delete them with the following code (it requires `import os` and `import shutil`):
+
+```python
+if os.path.exists("data/output/"):
+    shutil.rmtree("data/output/")
+```
+
 #### Sample output

 ```