Fix datetime column preprocessing for validation data. (#73)

* Fix datetime column preprocessing for validation data.

* Reformat a code line.
This commit is contained in:
Gian Pio Domiziani 2021-04-21 16:22:54 +02:00 committed by GitHub
parent f4f3f4f17b
commit ad42889a3b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 29 additions and 3 deletions

View File

@ -192,12 +192,13 @@ class DataTransformer:
if isinstance(X, pd.DataFrame):
X = X.copy()
n = X.shape[0]
cat_columns, num_columns = [], []
cat_columns, num_columns, datetime_columns = [], [], []
drop = False
for column in X.columns:
# sklearn\utils\validation.py needs int/float values
if X[column].dtype.name == 'datetime64[ns]':
X[column] = X[column].map(datetime.toordinal)
datetime_columns.append(column)
if X[column].dtype.name in ('object', 'category'):
if X[column].nunique() == 1 or X[column].nunique(
dropna=True) == n - X[column].isnull().sum():
@ -236,7 +237,8 @@ class DataTransformer:
SimpleImputer(missing_values=np.nan, strategy='median'),
X_num.columns)])
X[num_columns] = self.transformer.fit_transform(X_num)
self._cat_columns, self._num_columns = cat_columns, num_columns
self._cat_columns, self._num_columns, self._datetime_columns = cat_columns, \
num_columns, datetime_columns
self._drop = drop
if task == 'regression':
@ -249,7 +251,11 @@ class DataTransformer:
def transform(self, X):
if isinstance(X, pd.DataFrame):
cat_columns, num_columns = self._cat_columns, self._num_columns
cat_columns, num_columns, datetime_columns = self._cat_columns, \
self._num_columns, self._datetime_columns
if datetime_columns:
for dt_column in datetime_columns:
X[dt_column] = X[dt_column].map(datetime.toordinal)
X = X[cat_columns + num_columns].copy()
for column in cat_columns:
# print(column, X[column].dtype.name)

View File

@ -4,6 +4,9 @@ import numpy as np
import scipy.sparse
from sklearn.datasets import load_boston, load_iris, load_wine
import pandas as pd
from datetime import datetime
from flaml import AutoML
from flaml.data import get_output_from_log
@ -219,6 +222,23 @@ class TestAutoML(unittest.TestCase):
print(automl_experiment.model)
print(automl_experiment.predict_proba(X_train)[:5])
def test_datetime_columns(self):
    """Smoke-test ``AutoML.fit`` on a frame with a datetime column.

    A two-row DataFrame whose single column ``A`` holds datetime values
    is passed as both the training and the validation set of a short
    regression run. The test makes no assertions; it passes as long as
    ``fit`` completes without raising.
    """
    # Same keys and values as a literal settings dict, in the same order.
    settings = dict(
        time_budget=2,
        metric='mse',
        task='regression',
        log_file_name="test/datetime_columns.log",
        log_training_metric=True,
        n_jobs=1,
        model_history=True,
    )
    timestamps = [datetime(1900, 2, 3), datetime(1900, 3, 4)]
    frame = pd.DataFrame({'A': timestamps})
    target = np.array([0, 1])

    automl = AutoML()
    # Reusing the frame for X_val exercises the validation-side
    # datetime handling as well as the training-side one.
    automl.fit(
        X_train=frame,
        X_val=frame,
        y_train=target,
        y_val=target,
        **settings,
    )
def test_regression(self):
automl_experiment = AutoML()