autogen/test/test_forecast.py

def test_forecast_automl_df(budget=5):
    # using dataframe
    import statsmodels.api as sm
    data = sm.datasets.co2.load_pandas()
    data = data.data
    data = data['co2'].resample('MS').mean()
    data = data.fillna(data.bfill())
    data = data.to_frame().reset_index()
    data = data.rename(columns={'index': 'ds', 'co2': 'y'})
    num_samples = data.shape[0]
    time_horizon = 12
    split_idx = num_samples - time_horizon
    X_train = data[:split_idx]
    X_test = data[split_idx:]['ds'].to_frame()
    y_test = data[split_idx:]['y'].to_frame()
    ''' import AutoML class from flaml package '''
    from flaml import AutoML
    automl = AutoML()
    settings = {
        "time_budget": budget,  # total running time in seconds
        "metric": 'mape',  # primary metric
        "task": 'forecast',  # task type
        "log_file_name": 'CO2_forecast.log',  # flaml log file
        "eval_method": "holdout",
        "split_type": 'time'
    }
    '''The main flaml automl API'''
    # fall back to the statsmodels-based estimators if an optional
    # dependency (e.g. prophet) is not installed
    try:
        automl.fit(dataframe=X_train, **settings, period=time_horizon, freq='M')
    except ImportError:
        automl.fit(dataframe=X_train, **settings, estimator_list=['arima', 'sarimax'], period=time_horizon, freq='M')
    ''' retrieve best config and best learner '''
    print('Best ML learner:', automl.best_estimator)
    print('Best hyperparameter config:', automl.best_config)
    print(f'Best mape on validation data: {automl.best_loss}')
    print(f'Training duration of best run: {automl.best_config_train_time}s')
    print(automl.model.estimator)
    ''' pickle and save the automl object '''
    import pickle
    with open('automl.pkl', 'wb') as f:
        pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
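    # minimal sanity check (sketch): reload the pickled object and confirm it
    # can still predict; the name `automl_loaded` is illustrative, not part of
    # the original test
    with open('automl.pkl', 'rb') as f:
        automl_loaded = pickle.load(f)
    print('Prediction from reloaded model', automl_loaded.predict(X_test))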
    ''' compute predictions of testing dataset '''
    y_pred = automl.predict(X_test)
    print('Predicted labels', y_pred)
    print('True labels', y_test)
    ''' compute different metric values on testing dataset '''
    from flaml.ml import sklearn_metric_loss_score
    print('mape', '=', sklearn_metric_loss_score('mape', y_pred, y_test))
    from flaml.data import get_output_from_log
    time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \
        get_output_from_log(filename=settings['log_file_name'], time_budget=budget)
    for config in config_history:
        print(config)
    print(automl.prune_attr)
    print(automl.max_resource)
    print(automl.min_resource)
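    # a minimal sketch of how the retrieved search history could be visualized
    # (matplotlib is an assumption here, not a dependency of this test):
    # import matplotlib.pyplot as plt
    # plt.step(time_history, best_valid_loss_history, where='post')
    # plt.xlabel('wall clock time (s)')
    # plt.ylabel('best validation loss (mape)')
    # plt.show()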


def test_forecast_automl_Xy(budget=5):
    # using X_train and y_train
    import statsmodels.api as sm
    data = sm.datasets.co2.load_pandas()
    data = data.data
    data = data['co2'].resample('MS').mean()
    data = data.fillna(data.bfill())
    data = data.to_frame().reset_index()
    num_samples = data.shape[0]
    time_horizon = 12
    split_idx = num_samples - time_horizon
    X_train = data[:split_idx]['index'].to_frame()
    y_train = data[:split_idx]['co2']
    X_test = data[split_idx:]['index'].to_frame()
    y_test = data[split_idx:]['co2'].to_frame()
    ''' import AutoML class from flaml package '''
    from flaml import AutoML
    automl = AutoML()
    settings = {
        "time_budget": budget,  # total running time in seconds
        "metric": 'mape',  # primary metric
        "task": 'forecast',  # task type
        "log_file_name": 'CO2_forecast.log',  # flaml log file
        "eval_method": "holdout",
        "split_type": 'time'
    }
    '''The main flaml automl API'''
    # fall back to the statsmodels-based estimators if an optional
    # dependency (e.g. prophet) is not installed
    try:
        automl.fit(X_train=X_train, y_train=y_train, **settings, period=time_horizon, freq='M')
    except ImportError:
        automl.fit(X_train=X_train, y_train=y_train, **settings, estimator_list=['arima', 'sarimax'], period=time_horizon, freq='M')
    ''' retrieve best config and best learner '''
    print('Best ML learner:', automl.best_estimator)
    print('Best hyperparameter config:', automl.best_config)
    print(f'Best mape on validation data: {automl.best_loss}')
    print(f'Training duration of best run: {automl.best_config_train_time}s')
    print(automl.model.estimator)
    ''' pickle and save the automl object '''
    import pickle
    with open('automl.pkl', 'wb') as f:
        pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
    ''' compute predictions of testing dataset '''
    y_pred = automl.predict(X_test)
    print('Predicted labels', y_pred)
    print('True labels', y_test)
    ''' compute different metric values on testing dataset '''
    from flaml.ml import sklearn_metric_loss_score
    print('mape', '=', sklearn_metric_loss_score('mape', y_pred, y_test))
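    # the same helper accepts other metric names; 'rmse' and 'mae' are assumed
    # to be supported and are shown only as an illustrative sketch:
    # print('rmse', '=', sklearn_metric_loss_score('rmse', y_pred, y_test))
    # print('mae', '=', sklearn_metric_loss_score('mae', y_pred, y_test))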
    from flaml.data import get_output_from_log
    time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \
        get_output_from_log(filename=settings['log_file_name'], time_budget=budget)
    for config in config_history:
        print(config)
    print(automl.prune_attr)
    print(automl.max_resource)
    print(automl.min_resource)


if __name__ == "__main__":
    test_forecast_automl_df(60)
    test_forecast_automl_Xy(60)