diff --git a/doc/src/sgml/ref/create_model.sgmlin b/doc/src/sgml/ref/create_model.sgmlin index 3cb22c661..523e23063 100644 --- a/doc/src/sgml/ref/create_model.sgmlin +++ b/doc/src/sgml/ref/create_model.sgmlin @@ -80,6 +80,7 @@ For example: * verbose: 0 (no output), 1 (less output), or 2 (full output) # Hyperparameter list for 'xgboost_regression_logistic', 'xgboost_binary_logistic', 'xgboost_regression_gamma' and 'xgboost_regression_squarederror': + * n_iter: Maximum iterations until convergence * batch_size: Number of tuples in each processing batch * booster: Which booster to use, e.g., gbtree, gblinear or dart (default: gbtree) * tree_method: The tree construction algorithm used in XGBoost. Choices: auto, exact, approx, hist, gpu_hist (gpu_hist only supported with GPU) diff --git a/src/gausskernel/dbmind/db4ai/executor/gd/pca.cpp b/src/gausskernel/dbmind/db4ai/executor/gd/pca.cpp index 3cad823ea..818dc1f04 100644 --- a/src/gausskernel/dbmind/db4ai/executor/gd/pca.cpp +++ b/src/gausskernel/dbmind/db4ai/executor/gd/pca.cpp @@ -257,7 +257,7 @@ static Datum pca_predict(const Matrix *features, const Matrix *weights, static HyperparameterDefinition pca_hyperparameter_definitions[] = { HYPERPARAMETER_INT4("number_components", 1, 1, true, INT32_MAX, true, HyperparametersGD, number_dimensions, HP_NO_AUTOML()), - HYPERPARAMETER_INT4("batch_size", 1000, 1, true, INT32_MAX, true, HyperparametersGD, batch_size, HP_NO_AUTOML()), + HYPERPARAMETER_INT4("batch_size", 1000, 1, true, MAX_BATCH_SIZE, true, HyperparametersGD, batch_size, HP_NO_AUTOML()), HYPERPARAMETER_INT4("max_iterations", 100, 1, true, ITER_MAX, true, HyperparametersGD, max_iterations, HP_NO_AUTOML()), HYPERPARAMETER_INT4("max_seconds", 0, 0, true, INT32_MAX, true, HyperparametersGD, max_seconds, HP_NO_AUTOML()), diff --git a/src/gausskernel/dbmind/db4ai/executor/kmeans/kmeans.cpp b/src/gausskernel/dbmind/db4ai/executor/kmeans/kmeans.cpp index 7ded06b65..79988a099 100644 --- a/src/gausskernel/dbmind/db4ai/executor/kmeans/kmeans.cpp +++ b/src/gausskernel/dbmind/db4ai/executor/kmeans/kmeans.cpp @@ -1044,7 +1044,7 @@ HyperparameterDefinition kmeans_hyperparameter_definitions[] = { HYPERPARAMETER_INT4("max_iterations", 10, 1, true, ITER_MAX, true, HyperparametersKMeans, num_iterations, HP_NO_AUTOML()), HYPERPARAMETER_INT4("num_features", 0, 1, true, INT32_MAX, true, HyperparametersKMeans, n_features, HP_NO_AUTOML()), - HYPERPARAMETER_INT4("batch_size", 1000, 1, true, 1000000, true, HyperparametersKMeans, batch_size, HP_NO_AUTOML()), + HYPERPARAMETER_INT4("batch_size", 1000, 1, true, MAX_BATCH_SIZE, true, HyperparametersKMeans, batch_size, HP_NO_AUTOML()), HYPERPARAMETER_INT4("seed", 0, 0, true, INT32_MAX, true, HyperparametersKMeans, external_seed, HP_AUTOML_INT(1, INT32_MAX, 1, ProbabilityDistribution::UNIFORM_RANGE)), HYPERPARAMETER_FLOAT8("tolerance", 0.00001, 0.0, false, 1.0, true, HyperparametersKMeans, tolerance, diff --git a/src/gausskernel/dbmind/db4ai/executor/xgboost/xgboost.cpp b/src/gausskernel/dbmind/db4ai/executor/xgboost/xgboost.cpp index 288caf004..e5b649e65 100644 --- a/src/gausskernel/dbmind/db4ai/executor/xgboost/xgboost.cpp +++ b/src/gausskernel/dbmind/db4ai/executor/xgboost/xgboost.cpp @@ -226,13 +226,13 @@ const char *xgboost_tree_method_str[] = {"auto", "exact", "approx", "hist", "gpu const char *xgboost_eval_metric_str[] = {"rmse", "rmsle", "map", "mae", "auc", "aucpr" }; static HyperparameterDefinition xgboost_hyperparameter_definitions[] = { HYPERPARAMETER_INT4("n_iter", 10, 1, true, ITER_MAX, true, HyperparamsXGBoost, n_iterations, HP_NO_AUTOML()), - HYPERPARAMETER_INT4("batch_size", 10000, 1, true, INT32_MAX, true, HyperparamsXGBoost, batch_size, HP_NO_AUTOML()), + HYPERPARAMETER_INT4("batch_size", 10000, 1, true, MAX_BATCH_SIZE, true, HyperparamsXGBoost, batch_size, HP_NO_AUTOML()), HYPERPARAMETER_INT4("max_depth", 5, 0, true, INT32_MAX, true, HyperparamsXGBoost, max_depth, HP_NO_AUTOML()), HYPERPARAMETER_INT4("min_child_weight", 1, 0, true, INT32_MAX, true, HyperparamsXGBoost, min_child_weight, HP_NO_AUTOML()), - HYPERPARAMETER_FLOAT8("gamma", 0.0, 0.0, true, 1, true, HyperparamsXGBoost, gamma, HP_NO_AUTOML()), + HYPERPARAMETER_FLOAT8("gamma", 0.0, 0.0, true, DBL_MAX, true, HyperparamsXGBoost, gamma, HP_NO_AUTOML()), HYPERPARAMETER_FLOAT8("eta", 0.3, 0.0, true, 1, true, HyperparamsXGBoost, eta, HP_NO_AUTOML()), - HYPERPARAMETER_INT4("nthread", 1, 0, true, INT32_MAX, true, HyperparamsXGBoost, nthread, HP_NO_AUTOML()), + HYPERPARAMETER_INT4("nthread", 1, 0, true, 100, true, HyperparamsXGBoost, nthread, HP_NO_AUTOML()), HYPERPARAMETER_INT4("verbosity", 1, 0, true, 3, true, HyperparamsXGBoost, verbosity, HP_NO_AUTOML()), HYPERPARAMETER_INT4("seed", 0, 0, true, INT32_MAX, true, HyperparamsXGBoost, seed, HP_AUTOML_INT(1, INT32_MAX, 1, ProbabilityDistribution::UNIFORM_RANGE)), diff --git a/src/include/db4ai/db4ai_common.h b/src/include/db4ai/db4ai_common.h index 104ea26c7..880eb7106 100644 --- a/src/include/db4ai/db4ai_common.h +++ b/src/include/db4ai/db4ai_common.h @@ -26,6 +26,7 @@ #include "utils/timestamp.h" #define ITER_MAX 10000 +#define MAX_BATCH_SIZE 0x0fffff uint64_t time_diff(struct timespec *time_p1, struct timespec *time_p2); double interval_to_sec(double time_interval); diff --git a/src/include/db4ai/gd.h b/src/include/db4ai/gd.h index 471a6b6ab..394f272f6 100644 --- a/src/include/db4ai/gd.h +++ b/src/include/db4ai/gd.h @@ -261,7 +261,7 @@ typedef struct HyperparametersGD { } HyperparametersGD; #define GD_HYPERPARAMETERS_SUPERVISED \ - HYPERPARAMETER_INT4("batch_size", 1000, 1, true, INT32_MAX, true, \ + HYPERPARAMETER_INT4("batch_size", 1000, 1, true, MAX_BATCH_SIZE, true, \ HyperparametersGD, batch_size, \ HP_AUTOML_INT(1, 10000, 4, ProbabilityDistribution::LOG_RANGE)), \ HYPERPARAMETER_FLOAT8("decay", 0.95, 0.0, false, DBL_MAX, true, \ diff --git a/src/test/regress/input/db4ai_kmeans_train_predict.source b/src/test/regress/input/db4ai_kmeans_train_predict.source index 05e38f476..9d0ab298e 100644 --- a/src/test/regress/input/db4ai_kmeans_train_predict.source +++ b/src/test/regress/input/db4ai_kmeans_train_predict.source @@ -1201,7 +1201,7 @@ CREATE MODEL my_kmeans_pp_empty USING kmeans FROM (SELECT position FROM multivar -- Batch size CREATE MODEL my_kmeans_pp_empty USING kmeans FROM (SELECT position FROM multivariate_7_1000_10) WITH max_iterations = 50, num_centroids = 10, tolerance = 0.00001, batch_size = 0, num_features = 7, distance_function = 'L2_Squared', seeding_function = 'Random++', verbose = 1, seed = 1255025990; -CREATE MODEL my_kmeans_pp_empty USING kmeans FROM (SELECT position FROM multivariate_7_1000_10) WITH max_iterations = 50, num_centroids = 10, tolerance = 0.00001, batch_size = 1000001, num_features = 7, distance_function = 'L2_Squared', seeding_function = 'Random++', verbose = 1, seed = 1255025990; +CREATE MODEL my_kmeans_pp_empty USING kmeans FROM (SELECT position FROM multivariate_7_1000_10) WITH max_iterations = 50, num_centroids = 10, tolerance = 0.00001, batch_size = 1048576, num_features = 7, distance_function = 'L2_Squared', seeding_function = 'Random++', verbose = 1, seed = 1255025990; -- Num of features (not matching the data) CREATE MODEL my_kmeans_pp_empty USING kmeans FROM (SELECT position FROM multivariate_7_1000_10) WITH max_iterations = 50, num_centroids = 10, tolerance = 0.00001, batch_size = 1000, num_features = 9, distance_function = 'L2_Squared', seeding_function = 'Random++', verbose = 1, seed = 1255025990; diff --git a/src/test/regress/output/db4ai_gd_train_predict.source b/src/test/regress/output/db4ai_gd_train_predict.source index 65ef18577..4d4501e35 100644 --- a/src/test/regress/output/db4ai_gd_train_predict.source +++ b/src/test/regress/output/db4ai_gd_train_predict.source @@ -14,13 +14,13 @@ CREATE MODEL m using logistic_regression FEATURES size,lot FROM db4ai_houses; ERROR: Supervised ML algorithms require TARGET clause -- Errors with semantic validation of hyperparameters CREATE MODEL m USING logistic_regression FEATURES size, lot TARGET price <100000 FROM db4ai_houses with batch_size = 0, seed=1; -ERROR: Hyperparameter batch_size must be in the range [1,2147483647] +ERROR: Hyperparameter batch_size must be in the range [1,1048575] CREATE MODEL m USING logistic_regression FEATURES size, lot TARGET price <100000 FROM db4ai_houses with decay = 0.0, seed=1; ERROR: Hyperparameter decay must be in the range (0,1.7976931e+308] CREATE MODEL m USING logistic_regression FEATURES size, lot TARGET price <100000 FROM db4ai_houses with learning_rate = 0.0, seed=1; ERROR: Hyperparameter learning_rate must be in the range (0,1.7976931e+308] CREATE MODEL m USING logistic_regression FEATURES size, lot TARGET price <100000 FROM db4ai_houses with max_iterations = 0, seed=1; -ERROR: Hyperparameter max_iterations must be in the range [1,2147483647] +ERROR: Hyperparameter max_iterations must be in the range [1,10000] CREATE MODEL m USING logistic_regression FEATURES size, lot TARGET price <100000 FROM db4ai_houses with max_seconds = -1, seed=1; ERROR: Hyperparameter max_seconds must be in the range [0,2147483647] CREATE MODEL m USING logistic_regression FEATURES size, lot TARGET price <100000 FROM db4ai_houses with optimizer = nogd, seed=1; diff --git a/src/test/regress/output/db4ai_kmeans_train_predict.source b/src/test/regress/output/db4ai_kmeans_train_predict.source index 4aa618da7..64e5bee71 100644 --- a/src/test/regress/output/db4ai_kmeans_train_predict.source +++ b/src/test/regress/output/db4ai_kmeans_train_predict.source @@ -742,7 +742,7 @@ CONTEXT: referenced column: centroid_id -- Wrong parameters -- Number of iterations CREATE MODEL my_kmeans_pp_empty USING kmeans FROM (SELECT position FROM multivariate_7_1000_10) WITH max_iterations = 0, num_centroids = 10, tolerance = 0.00001, batch_size = 1000, num_features = 7, distance_function = 'L2_Squared', seeding_function = 'Random++', verbose = 1, seed = 1255025990; -ERROR: Hyperparameter max_iterations must be in the range [1,2147483647] +ERROR: Hyperparameter max_iterations must be in the range [1,10000] -- Number of centroids CREATE MODEL my_kmeans_pp_empty USING kmeans FROM (SELECT position FROM multivariate_7_1000_10) WITH max_iterations = 50, num_centroids = 0, tolerance = 0.00001, batch_size = 1000, num_features = 7, distance_function = 'L2_Squared', seeding_function = 'Random++', verbose = 1, seed = 1255025990; ERROR: Hyperparameter num_centroids must be in the range [1,1000000] @@ -755,9 +755,9 @@ CREATE MODEL my_kmeans_pp_empty USING kmeans FROM (SELECT position FROM multivar ERROR: Hyperparameter tolerance must be in the range (0,1] -- Batch size CREATE MODEL my_kmeans_pp_empty USING kmeans FROM (SELECT position FROM multivariate_7_1000_10) WITH max_iterations = 50, num_centroids = 10, tolerance = 0.00001, batch_size = 0, num_features = 7, distance_function = 'L2_Squared', seeding_function = 'Random++', verbose = 1, seed = 1255025990; -ERROR: Hyperparameter batch_size must be in the range [1,1000000] -CREATE MODEL my_kmeans_pp_empty USING kmeans FROM (SELECT position FROM multivariate_7_1000_10) WITH max_iterations = 50, num_centroids = 10, tolerance = 0.00001, batch_size = 1000001, num_features = 7, distance_function = 'L2_Squared', seeding_function = 'Random++', verbose = 1, seed = 1255025990; -ERROR: Hyperparameter batch_size must be in the range [1,1000000] +ERROR: Hyperparameter batch_size must be in the range [1,1048575] +CREATE MODEL my_kmeans_pp_empty USING kmeans FROM (SELECT position FROM multivariate_7_1000_10) WITH max_iterations = 50, num_centroids = 10, tolerance = 0.00001, batch_size = 1048576, num_features = 7, distance_function = 'L2_Squared', seeding_function = 'Random++', verbose = 1, seed = 1255025990; +ERROR: Hyperparameter batch_size must be in the range [1,1048575] -- Num of features (not matching the data) CREATE MODEL my_kmeans_pp_empty USING kmeans FROM (SELECT position FROM multivariate_7_1000_10) WITH max_iterations = 50, num_centroids = 10, tolerance = 0.00001, batch_size = 1000, num_features = 9, distance_function = 'L2_Squared', seeding_function = 'Random++', verbose = 1, seed = 1255025990; NOTICE: *** Initial statistics gathered: diff --git a/src/test/regress/output/db4ai_xgboost_train_predict.source b/src/test/regress/output/db4ai_xgboost_train_predict.source index b0124609c..4d8e6c52c 100644 --- a/src/test/regress/output/db4ai_xgboost_train_predict.source +++ b/src/test/regress/output/db4ai_xgboost_train_predict.source @@ -12,9 +12,9 @@ CREATE MODEL m using xgboost_binary_logistic FROM db4ai_rain; ERROR: Supervised ML algorithms require FEATURES clause -- Errors with semantic validation of hyperparameters CREATE MODEL m USING xgboost_binary_logistic FEATURES rainfall, temp9am TARGET raintoday FROM db4ai_rain WITH n_iter=-1; -ERROR: Hyperparameter n_iter must be in the range [1,2147483647] +ERROR: Hyperparameter n_iter must be in the range [1,10000] CREATE MODEL m USING xgboost_binary_logistic FEATURES rainfall, temp9am TARGET raintoday FROM db4ai_rain WITH batch_size=0; -ERROR: Hyperparameter batch_size must be in the range [1,2147483647] +ERROR: Hyperparameter batch_size must be in the range [1,1048575] CREATE MODEL m USING xgboost_binary_logistic FEATURES rainfall, temp9am TARGET raintoday FROM db4ai_rain WITH max_depth=-1; ERROR: Hyperparameter max_depth must be in the range [0,2147483647] CREATE MODEL m USING xgboost_binary_logistic FEATURES rainfall, temp9am TARGET raintoday FROM db4ai_rain WITH min_child_weight=-1; @@ -24,7 +24,7 @@ ERROR: Hyperparameter eta must be in the range [0,1] CREATE MODEL m USING xgboost_binary_logistic FEATURES rainfall, temp9am TARGET raintoday FROM db4ai_rain WITH seed=-1; ERROR: Hyperparameter seed must be in the range [0,2147483647] CREATE MODEL m USING xgboost_binary_logistic FEATURES rainfall, temp9am TARGET raintoday FROM db4ai_rain WITH nthread=-1; -ERROR: Hyperparameter nthread must be in the range [0,2147483647] +ERROR: Hyperparameter nthread must be in the range [0,100] CREATE MODEL m USING xgboost_binary_logistic FEATURES rainfall, temp9am TARGET raintoday FROM db4ai_rain WITH booster=10; ERROR: Hyperparameter booster must be a string CREATE MODEL m USING xgboost_binary_logistic FEATURES rainfall, temp9am TARGET raintoday FROM db4ai_rain WITH tree_method=10;