diff --git a/docs/ML_PARAMETER_COVERAGE.md b/docs/ML_PARAMETER_COVERAGE.md
index 30a7e9f..e880685 100644
--- a/docs/ML_PARAMETER_COVERAGE.md
+++ b/docs/ML_PARAMETER_COVERAGE.md
@@ -66,4 +66,18 @@ python collect_ml_search_data.py --list
 
 ---
 
+---
+
+## LightGBM Classifier Function
+
+**Function ID:** `lightgbm_classifier`
+
+ℹ️ **Status:** Database exists but contains no data
+
+## LightGBM Regressor Function
+
+**Function ID:** `lightgbm_regressor`
+
+ℹ️ **Status:** Database exists but contains no data
+
 *Generated on 2025-08-09 14:19:51*
diff --git a/docs/_generators/generate_diagrams.py b/docs/_generators/generate_diagrams.py
index 0adf005..2d9f5aa 100644
--- a/docs/_generators/generate_diagrams.py
+++ b/docs/_generators/generate_diagrams.py
@@ -640,7 +640,7 @@ def main():
 
     for filename, generator in generators.items():
         content = generator()
         output_path = DIAGRAMS_DIR / filename
-        output_path.write_text(content)
+        output_path.write_text(content, encoding="utf-8")
         print(f"  Generated: {filename}")
 
diff --git a/docs/ml_parameter_coverage.json b/docs/ml_parameter_coverage.json
index 6bbbdd4..122fbf3 100644
--- a/docs/ml_parameter_coverage.json
+++ b/docs/ml_parameter_coverage.json
@@ -235,6 +235,160 @@
       ]
     }
   }
+    }
+  }
+  ,
+  "lightgbm_classifier": {
+    "default_ranges": {
+      "n_estimators": {
+        "type": "numeric",
+        "default_count": 29,
+        "default_values": ["10", "50", "100", "200", "500"],
+        "is_truncated": true
+      },
+      "learning_rate": {
+        "type": "numeric",
+        "default_count": 4,
+        "default_values": ["0.01", "0.05", "0.1", "0.3"],
+        "is_truncated": false
+      },
+      "num_leaves": {
+        "type": "numeric",
+        "default_count": 18,
+        "default_values": ["20", "31", "40", "50"],
+        "is_truncated": true
+      },
+      "max_depth": {
+        "type": "numeric",
+        "default_count": 18,
+        "default_values": ["-1", "5", "10", "15"],
+        "is_truncated": true
+      },
+      "min_child_samples": {
+        "type": "numeric",
+        "default_count": 19,
+        "default_values": ["10", "20", "30"],
+        "is_truncated": true
+      },
+      "subsample": {
+        "type": "numeric",
+        "default_count": 10,
+        "default_values": ["0.8", "0.9", "1.0"],
+        "is_truncated": false
+      },
+      "colsample_bytree": {
+        "type": "numeric",
+        "default_count": 10,
+        "default_values": ["0.8", "0.9", "1.0"],
+        "is_truncated": false
+      },
+      "reg_alpha": {
+        "type": "numeric",
+        "default_count": 6,
+        "default_values": ["0.0", "0.1", "0.5"],
+        "is_truncated": false
+      },
+      "reg_lambda": {
+        "type": "numeric",
+        "default_count": 5,
+        "default_values": ["0.0", "0.1", "0.5"],
+        "is_truncated": false
+      },
+      "dataset": {
+        "type": "categorical",
+        "default_count": 5,
+        "default_values": [
+          "",
+          "",
+          ""
+        ],
+        "is_truncated": false
+      },
+      "cv": {
+        "type": "numeric",
+        "default_count": 4,
+        "default_values": [2, 3, 4, 5],
+        "is_truncated": false
+      }
+    },
+    "stored_ranges": {
+      "exists": true,
+      "message": "Database exists but contains no data"
+    }
+  },
+  "lightgbm_regressor": {
+    "default_ranges": {
+      "n_estimators": {
+        "type": "numeric",
+        "default_count": 29,
+        "default_values": ["10", "50", "100", "200", "500"],
+        "is_truncated": true
+      },
+      "learning_rate": {
+        "type": "numeric",
+        "default_count": 4,
+        "default_values": ["0.01", "0.05", "0.1", "0.3"],
+        "is_truncated": false
+      },
+      "num_leaves": {
+        "type": "numeric",
+        "default_count": 18,
+        "default_values": ["20", "31", "40", "50"],
+        "is_truncated": true
+      },
+      "max_depth": {
+        "type": "numeric",
+        "default_count": 18,
+        "default_values": ["-1", "5", "10", "15"],
+        "is_truncated": true
+      },
+      "min_child_samples": {
+        "type": "numeric",
+        "default_count": 19,
+ "default_values": ["10", "20", "30"], + "is_truncated": true + }, + "subsample": { + "type": "numeric", + "default_values": ["0.8", "0.9", "1.0"], + "is_truncated": false + }, + "colsample_bytree": { + "type": "numeric", + "default_count": 10, + "default_values": ["0.8", "0.9", "1.0"], + "is_truncated": false + }, + "reg_alpha": { + "type": "numeric", + "default_count": 6, + "default_values": ["0.0", "0.1", "0.5"], + "is_truncated": false + }, + "reg_lambda": { + "type": "numeric", + "default_count": 5, + "default_values": ["0.0", "0.1", "0.5"], + "is_truncated": false + }, + "dataset": { + "type": "categorical", + "default_count": 5, + "default_values": [ + "" + ], + "is_truncated": false + }, + "cv": { + "type": "numeric", + "default_count": 4, + "default_values": [2, 3, 4, 5], + "is_truncated": false + } + }, + "stored_ranges": { + "exists": true, + "message": "Database exists but contains no data" } } } \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt index d06da00..28681e7 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -18,6 +18,7 @@ numpy pandas scikit-learn matplotlib +lightgbm # For plot generation plotly>=5.0 diff --git a/src/surfaces/test_functions/algebraic/__init__.py b/src/surfaces/test_functions/algebraic/__init__.py index 1742255..13d3d46 100644 --- a/src/surfaces/test_functions/algebraic/__init__.py +++ b/src/surfaces/test_functions/algebraic/__init__.py @@ -69,6 +69,7 @@ standard_functions_1d, standard_functions_2d, standard_functions_nd, + ShekelFunction, ) __all__ = [ @@ -108,6 +109,7 @@ "RosenbrockFunction", "SphereFunction", "StyblinskiTangFunction", + "ShekelFunction" # Constrained "CantileverBeamFunction", "PressureVesselFunction", diff --git a/src/surfaces/test_functions/algebraic/standard/__init__.py b/src/surfaces/test_functions/algebraic/standard/__init__.py index f1cfeed..1e76259 100644 --- a/src/surfaces/test_functions/algebraic/standard/__init__.py +++ b/src/surfaces/test_functions/algebraic/standard/__init__.py @@ -42,6 +42,7 @@ RosenbrockFunction, SphereFunction, StyblinskiTangFunction, + ShekelFunction ) __all__ = [ @@ -76,6 +77,7 @@ "RosenbrockFunction", "SphereFunction", "StyblinskiTangFunction", + "ShekelFunction" ] standard_functions = [ @@ -110,6 +112,7 @@ RosenbrockFunction, SphereFunction, StyblinskiTangFunction, + ShekelFunction ] standard_functions_1d = [ @@ -147,4 +150,5 @@ RosenbrockFunction, SphereFunction, StyblinskiTangFunction, + ShekelFunction ] diff --git a/src/surfaces/test_functions/algebraic/standard/test_functions_nd/__init__.py b/src/surfaces/test_functions/algebraic/standard/test_functions_nd/__init__.py index 0269746..7ae843a 100644 --- a/src/surfaces/test_functions/algebraic/standard/test_functions_nd/__init__.py +++ b/src/surfaces/test_functions/algebraic/standard/test_functions_nd/__init__.py @@ -8,6 +8,7 @@ from .rosenbrock_function import RosenbrockFunction from .sphere_function import SphereFunction from .styblinski_tang_function import StyblinskiTangFunction +from .shekel_function import ShekelFunction __all__ = [ "RastriginFunction", @@ -15,4 +16,5 @@ "SphereFunction", "StyblinskiTangFunction", "GriewankFunction", + "ShekelFunction", ] diff --git a/src/surfaces/test_functions/algebraic/standard/test_functions_nd/shekel_function.py b/src/surfaces/test_functions/algebraic/standard/test_functions_nd/shekel_function.py new file mode 100644 index 0000000..7d297cd --- /dev/null +++ b/src/surfaces/test_functions/algebraic/standard/test_functions_nd/shekel_function.py @@ -0,0 
+# Author: Zohaib Hassan
+
+from typing import Any, Callable, Dict, List, Optional, Union
+import numpy as np
+from surfaces._array_utils import ArrayLike, get_array_namespace
+from surfaces.modifiers import BaseModifier
+
+from ..._base_algebraic_function import AlgebraicFunction
+
+
+class ShekelFunction(AlgebraicFunction):
+    """Shekel 4-dimensional test function.
+
+    A multimodal, non-convex, continuous function. It is defined as the
+    sum of m inverse quadratic functions.
+
+    The function is defined as:
+
+    .. math::
+
+        f(\\vec{x}) = - \\sum_{i=1}^{m} \\left( \\sum_{j=1}^{4} (x_j - a_{ij})^2 + c_i \\right)^{-1}
+
+    where :math:`m` is the number of maxima (typically 5, 7, or 10).
+
+    The global minimum is located at :math:`x \\approx (4, 4, 4, 4)` and
+    the value depends on :math:`m`.
+
+    Parameters
+    ----------
+    m : int, default=10
+        Number of maxima. Standard values are 5, 7, or 10.
+    objective : str, default="minimize"
+        Either "minimize" or "maximize".
+    modifiers : list of BaseModifier, optional
+        List of modifiers to apply to function evaluations.
+    memory : bool, default=False
+        Whether to cache previous evaluations.
+    collect_data : bool, default=True
+        Whether to record evaluation data.
+
+    Attributes
+    ----------
+    n_dim : int
+        Number of dimensions (fixed at 4 for the standard Shekel function).
+    default_bounds : tuple
+        Default parameter bounds (0.0, 10.0).
+
+    Examples
+    --------
+    >>> from surfaces.test_functions import ShekelFunction
+    >>> func = ShekelFunction(m=10)
+    >>> result = func({"x0": 4.0, "x1": 4.0, "x2": 4.0, "x3": 4.0})
+    >>> float(result) < -10.0
+    True
+    >>> len(func.search_space)
+    4
+    """
+
+    name = "Shekel Function"
+    _name_ = "shekel_function"
+    __name__ = "ShekelFunction"
+
+    _spec = {
+        "convex": False,
+        "unimodal": False,
+        "separable": False,
+        "scalable": False,
+    }
+
+    f_global = -10.536
+    default_bounds = (0.0, 10.0)
+
+    latex_formula = r"f(\vec{x}) = - \sum_{i=1}^{m} \left( \sum_{j=1}^{4} (x_j - a_{ij})^2 + c_i \right)^{-1}"
+    pgfmath_formula = None
+
+    # Function sheet attributes
+    tagline = (
+        "A multimodal function with m sharp peaks. Often called 'Foxholes', "
+        "it tests an optimizer's ability to find the global minimum among many local optima."
+    )
+    display_bounds = (0.0, 10.0)
+    display_projection = {"fixed_values": {"x2": 4.0, "x3": 4.0}}
+    reference = "Shekel, J. (1971). Test function for multivariate search problems."
+ reference_url = "https://www.sfu.ca/~ssurjano/shekel.html" + + def __init__( + self, + m: int = 10, + objective: str = "minimize", + modifiers: Optional[List[BaseModifier]] = None, + memory: bool = False, + collect_data: bool = True, + callbacks: Optional[Union[Callable, List[Callable]]] = None, + catch_errors: Optional[Dict[type, float]] = None, + ) -> None: + super().__init__(objective, modifiers, memory, collect_data, callbacks, catch_errors) + self.n_dim = 4 + self.m = m + + self.A = np.array([ + [4.0, 4.0, 4.0, 4.0], + [1.0, 1.0, 1.0, 1.0], + [8.0, 8.0, 8.0, 8.0], + [6.0, 6.0, 6.0, 6.0], + [3.0, 7.0, 3.0, 7.0], + [2.0, 9.0, 2.0, 9.0], + [5.0, 5.0, 3.0, 3.0], + [8.0, 1.0, 8.0, 1.0], + [6.0, 2.0, 6.0, 2.0], + [7.0, 3.6, 7.0, 3.6], + ]) + + + self.c = np.array([0.1, 0.2, 0.2, 0.4, 0.4, 0.6, 0.3, 0.7, 0.5, 0.5]) + + + if m < 10: + self.A = self.A[:m] + self.c = self.c[:m] + + self.x_global = (4.0, 4.0, 4.0, 4.0) + + def _create_objective_function(self) -> None: + def shekel_function(params: Dict[str, Any]) -> float: + x_input = np.array([params[f"x{i}"] for i in range(self.n_dim)]) + + result = 0.0 + for i in range(self.m): + # (x - a_i)^T (x - a_i) + diff = x_input - self.A[i] + sq_sum = np.dot(diff, diff) + + result -= 1.0 / (sq_sum + self.c[i]) + + return result + + self.pure_objective_function = shekel_function + + def _batch_objective(self, X: ArrayLike) -> ArrayLike: + """Vectorized batch evaluation. + + Parameters + ---------- + X : ArrayLike + Array of shape (n_points, n_dim). + + Returns + ------- + ArrayLike + Array of shape (n_points,). + """ + xp = get_array_namespace(X) + + + A = xp.asarray(self.A) + c = xp.asarray(self.c) + + n_points = X.shape[0] + result = xp.zeros(n_points) + + for i in range(self.m): + + diff = X - A[i] + + + sq_sum = xp.sum(diff**2, axis=1) + + + result -= 1.0 / (sq_sum + c[i]) + + return result + + def _search_space( + self, + min: float = 0.0, + max: float = 10.0, + size: int = 10000, + value_types: str = "array", + ) -> Dict[str, Any]: + return super()._create_n_dim_search_space(min, max, size=size, value_types=value_types) \ No newline at end of file diff --git a/src/surfaces/test_functions/machine_learning/__init__.py b/src/surfaces/test_functions/machine_learning/__init__.py index d1260d0..4c0aaf1 100644 --- a/src/surfaces/test_functions/machine_learning/__init__.py +++ b/src/surfaces/test_functions/machine_learning/__init__.py @@ -33,6 +33,9 @@ def _check_sklearn(): KNeighborsClassifierFunction, KNeighborsRegressorFunction, KNNTSClassifierFunction, + # LightGBM + LightGBMClassifierFunction, + LightGBMRegressorFunction, RandomForestClassifierFunction, RandomForestForecasterFunction, # Image @@ -51,12 +54,14 @@ def _check_sklearn(): "KNeighborsClassifierFunction", "RandomForestClassifierFunction", "SVMClassifierFunction", + "LightGBMClassifierFunction", # Tabular - Regression "DecisionTreeRegressorFunction", "GradientBoostingRegressorFunction", "KNeighborsRegressorFunction", "RandomForestRegressorFunction", "SVMRegressorFunction", + "LightGBMRegressorFunction", # Time-series - Forecasting "GradientBoostingForecasterFunction", "RandomForestForecasterFunction", @@ -75,12 +80,14 @@ def _check_sklearn(): KNeighborsClassifierFunction, RandomForestClassifierFunction, SVMClassifierFunction, + LightGBMClassifierFunction, # Tabular - Regression DecisionTreeRegressorFunction, GradientBoostingRegressorFunction, KNeighborsRegressorFunction, RandomForestRegressorFunction, SVMRegressorFunction, + LightGBMRegressorFunction, # Time-series - Forecasting 
     GradientBoostingForecasterFunction,
     RandomForestForecasterFunction,
diff --git a/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/__init__.py b/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/__init__.py
index 2d86ab6..74bbd3a 100644
--- a/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/__init__.py
+++ b/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/__init__.py
@@ -34,6 +34,8 @@
     GradientBoostingRegressorFunction,
     KNeighborsClassifierFunction,
     KNeighborsRegressorFunction,
+    LightGBMClassifierFunction,
+    LightGBMRegressorFunction,
     RandomForestClassifierFunction,
     RandomForestRegressorFunction,
     SVMClassifierFunction,
@@ -57,12 +59,14 @@
     "KNeighborsClassifierFunction",
     "RandomForestClassifierFunction",
     "SVMClassifierFunction",
+    "LightGBMClassifierFunction",
     # Tabular - Regression
     "DecisionTreeRegressorFunction",
     "GradientBoostingRegressorFunction",
     "KNeighborsRegressorFunction",
     "RandomForestRegressorFunction",
     "SVMRegressorFunction",
+    "LightGBMRegressorFunction",
     # Time-series - Forecasting
     "GradientBoostingForecasterFunction",
     "RandomForestForecasterFunction",
diff --git a/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/tabular/__init__.py b/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/tabular/__init__.py
index 569b83e..549123d 100644
--- a/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/tabular/__init__.py
+++ b/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/tabular/__init__.py
@@ -6,6 +6,7 @@
     DecisionTreeClassifierFunction,
     GradientBoostingClassifierFunction,
     KNeighborsClassifierFunction,
+    LightGBMClassifierFunction,
     RandomForestClassifierFunction,
     SVMClassifierFunction,
 )
@@ -13,6 +14,7 @@
     DecisionTreeRegressorFunction,
     GradientBoostingRegressorFunction,
     KNeighborsRegressorFunction,
+    LightGBMRegressorFunction,
     RandomForestRegressorFunction,
     SVMRegressorFunction,
 )
@@ -24,10 +26,12 @@
     "KNeighborsClassifierFunction",
     "RandomForestClassifierFunction",
     "SVMClassifierFunction",
+    "LightGBMClassifierFunction",
     # Regression
     "DecisionTreeRegressorFunction",
     "GradientBoostingRegressorFunction",
     "KNeighborsRegressorFunction",
     "RandomForestRegressorFunction",
     "SVMRegressorFunction",
+    "LightGBMRegressorFunction",
 ]
diff --git a/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/tabular/classification/__init__.py b/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/tabular/classification/__init__.py
index d686b8a..07382fd 100644
--- a/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/tabular/classification/__init__.py
+++ b/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/tabular/classification/__init__.py
@@ -6,6 +6,7 @@
     DecisionTreeClassifierFunction,
     GradientBoostingClassifierFunction,
     KNeighborsClassifierFunction,
+    LightGBMClassifierFunction,
     RandomForestClassifierFunction,
     SVMClassifierFunction,
 )
@@ -16,4 +17,5 @@
     "KNeighborsClassifierFunction",
     "RandomForestClassifierFunction",
     "SVMClassifierFunction",
+    "LightGBMClassifierFunction",
 ]
diff --git a/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/tabular/classification/test_functions/__init__.py b/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/tabular/classification/test_functions/__init__.py
index a7f8ef7..30384a7 100644
--- a/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/tabular/classification/test_functions/__init__.py
+++ b/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/tabular/classification/test_functions/__init__.py
@@ -6,6 +6,7 @@
 from .decision_tree_classifier import DecisionTreeClassifierFunction
 from .gradient_boosting_classifier import GradientBoostingClassifierFunction
 from .k_neighbors_classifier import KNeighborsClassifierFunction
+from .lightgbm_classifier import LightGBMClassifierFunction
 from .random_forest_classifier import RandomForestClassifierFunction
 from .svm_classifier import SVMClassifierFunction
 
@@ -15,4 +16,5 @@
     "KNeighborsClassifierFunction",
     "RandomForestClassifierFunction",
     "SVMClassifierFunction",
+    "LightGBMClassifierFunction",
 ]
diff --git a/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/tabular/classification/test_functions/lightgbm_classifier.py b/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/tabular/classification/test_functions/lightgbm_classifier.py
new file mode 100644
index 0000000..224b6fe
--- /dev/null
+++ b/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/tabular/classification/test_functions/lightgbm_classifier.py
@@ -0,0 +1,139 @@
+"""LightGBM Classifier test function with surrogate support."""
+
+from typing import Any, Dict, List, Optional
+
+import numpy as np
+from lightgbm import LGBMClassifier
+from sklearn.model_selection import cross_val_score
+
+from surfaces.modifiers import BaseModifier
+
+from .._base_classification import BaseClassification
+from ..datasets import DATASETS
+
+
+class LightGBMClassifierFunction(BaseClassification):
+    """LightGBM Classifier test function.
+
+    Parameters
+    ----------
+    dataset : str, default="digits"
+        Dataset to use. One of: "digits", "iris", "wine", "breast_cancer", "covtype".
+    cv : int, default=5
+        Number of cross-validation folds.
+    use_surrogate : bool, default=False
+        If True, use a pre-trained surrogate for fast evaluation.
+    """
+
+    name = "LightGBM Classifier Function"
+    _name_ = "lightgbm_classifier"
+    __name__ = "LightGBMClassifierFunction"
+
+    available_datasets = list(DATASETS.keys())
+    available_cv = [2, 3, 5, 10]
+
+    para_names = [
+        "n_estimators",
+        "learning_rate",
+        "num_leaves",
+        "max_depth",
+        "min_child_samples",
+        "subsample",
+        "colsample_bytree",
+        "reg_alpha",
+        "reg_lambda",
+    ]
+
+    # Hyperparameter search-space defaults
+    n_estimators_default = list(np.arange(10, 300, 10))
+    learning_rate_default = [1e-3, 1e-1, 0.5, 1.0]
+    num_leaves_default = list(range(10, 100, 5))
+    max_depth_default = list(range(2, 20, 1))
+    min_child_samples_default = list(range(5, 100, 5))
+    subsample_default = list(np.arange(0.1, 1.01, 0.1))
+    colsample_bytree_default = list(np.arange(0.1, 1.01, 0.1))
+    reg_alpha_default = [0, 0.001, 0.01, 0.1, 1, 10]
+    reg_lambda_default = [0, 0.001, 0.01, 0.1, 10]
+
+    # Function sheet attributes for the docs
+    latex_formula = r"\text{CV-Accuracy} = f(\text{n\_estimators}, \text{learning\_rate}, \dots)"
+    tagline = (
+        "Cross-validated accuracy of a LightGBM classifier. "
+        "Gradient boosting with tree-based learning."
+    )
+
+    def __init__(
+        self,
+        dataset: str = "digits",
+        cv: int = 5,
+        objective: str = "maximize",
+        modifiers: Optional[List[BaseModifier]] = None,
+        memory: bool = False,
+        collect_data: bool = True,
+        callbacks=None,
+        catch_errors=None,
+        use_surrogate: bool = False,
+    ):
+        if dataset not in DATASETS:
+            raise ValueError(f"Unknown dataset '{dataset}'. Available: {self.available_datasets}")
+        if cv not in self.available_cv:
+            raise ValueError(f"Invalid cv={cv}. Available: {self.available_cv}")
+
+        self.dataset = dataset
+        self.cv = cv
+        self._dataset_loader = DATASETS[dataset]
+
+        super().__init__(
+            objective=objective,
+            modifiers=modifiers,
+            memory=memory,
+            collect_data=collect_data,
+            callbacks=callbacks,
+            catch_errors=catch_errors,
+            use_surrogate=use_surrogate,
+        )
+
+    @property
+    def search_space(self) -> Dict[str, Any]:
+        return {
+            "n_estimators": self.n_estimators_default,
+            "learning_rate": self.learning_rate_default,
+            "num_leaves": self.num_leaves_default,
+            "max_depth": self.max_depth_default,
+            "min_child_samples": self.min_child_samples_default,
+            "subsample": self.subsample_default,
+            "colsample_bytree": self.colsample_bytree_default,
+            "reg_alpha": self.reg_alpha_default,
+            "reg_lambda": self.reg_lambda_default,
+        }
+
+    def _create_objective_function(self) -> None:
+        """Create the objective function closure with fixed data."""
+        X, y = self._dataset_loader()
+        cv = self.cv
+
+        def objective(params: Dict[str, Any]) -> float:
+            clf = LGBMClassifier(
+                n_estimators=params["n_estimators"],
+                learning_rate=params["learning_rate"],
+                num_leaves=params["num_leaves"],
+                max_depth=params["max_depth"],
+                min_child_samples=params["min_child_samples"],
+                subsample=params["subsample"],
+                colsample_bytree=params["colsample_bytree"],
+                reg_alpha=params["reg_alpha"],
+                reg_lambda=params["reg_lambda"],
+                random_state=42,
+                n_jobs=-1,
+                verbose=-1,
+            )
+            scores = cross_val_score(clf, X, y, cv=cv, scoring="accuracy")
+            return scores.mean()
+
+        self.pure_objective_function = objective
+
+    def _get_surrogate_params(self, params: Dict[str, Any]) -> Dict[str, Any]:
+        return {**params, "dataset": self.dataset, "cv": self.cv}
diff --git a/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/tabular/regression/__init__.py b/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/tabular/regression/__init__.py
index c7ab14b..5cc7a17 100644
--- a/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/tabular/regression/__init__.py
+++ b/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/tabular/regression/__init__.py
@@ -6,6 +6,7 @@
     DecisionTreeRegressorFunction,
     GradientBoostingRegressorFunction,
     KNeighborsRegressorFunction,
+    LightGBMRegressorFunction,
     RandomForestRegressorFunction,
     SVMRegressorFunction,
 )
@@ -16,4 +17,5 @@
     "KNeighborsRegressorFunction",
     "RandomForestRegressorFunction",
     "SVMRegressorFunction",
+    "LightGBMRegressorFunction",
 ]
diff --git a/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/tabular/regression/test_functions/__init__.py b/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/tabular/regression/test_functions/__init__.py
index 26e2d86..b01df6a 100644
--- a/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/tabular/regression/test_functions/__init__.py
+++ b/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/tabular/regression/test_functions/__init__.py
@@ -6,6 +6,7 @@
 from .decision_tree_regressor import DecisionTreeRegressorFunction
 from .gradient_boosting_regressor import GradientBoostingRegressorFunction
 from .k_neighbors_regressor import KNeighborsRegressorFunction
+from .lightgbm_regressor import LightGBMRegressorFunction
 from .random_forest_regressor import RandomForestRegressorFunction
 from .svm_regressor import SVMRegressorFunction
 
@@ -15,4 +16,5 @@
     "KNeighborsRegressorFunction",
     "RandomForestRegressorFunction",
     "SVMRegressorFunction",
+    "LightGBMRegressorFunction",
 ]
diff --git a/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/tabular/regression/test_functions/lightgbm_regressor.py b/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/tabular/regression/test_functions/lightgbm_regressor.py
new file mode 100644
index 0000000..0f03158
--- /dev/null
+++ b/src/surfaces/test_functions/machine_learning/hyperparameter_optimization/tabular/regression/test_functions/lightgbm_regressor.py
@@ -0,0 +1,131 @@
+from typing import Any, Dict, List, Optional
+
+import numpy as np
+from lightgbm import LGBMRegressor
+from sklearn.model_selection import cross_val_score
+
+from surfaces.modifiers import BaseModifier
+
+from .._base_regression import BaseRegression
+from ..datasets import DATASETS
+
+
+class LightGBMRegressorFunction(BaseRegression):
+    """LightGBM Regressor test function with surrogate support.
+
+    Parameters
+    ----------
+    dataset : str, default="diabetes"
+        Dataset to use. One of: "diabetes", "california", "friedman1", "friedman2", "linear".
+    cv : int, default=5
+        Number of cross-validation folds.
+    use_surrogate : bool, default=False
+        If True, use a pre-trained surrogate for fast evaluation.
+    """
+
+    name = "LightGBM Regressor Function"
+    _name_ = "lightgbm_regressor"
+    __name__ = "LightGBMRegressorFunction"
+
+    available_datasets = list(DATASETS.keys())
+    available_cv = [2, 3, 5, 10]
+
+    para_names = [
+        "n_estimators",
+        "learning_rate",
+        "num_leaves",
+        "max_depth",
+        "min_child_samples",
+        "subsample",
+        "colsample_bytree",
+        "reg_alpha",
+        "reg_lambda",
+    ]
+
+    # Hyperparameter search-space defaults
+    n_estimators_default = list(np.arange(10, 300, 10))
+    learning_rate_default = [1e-3, 1e-1, 0.5, 1.0]
+    num_leaves_default = list(range(10, 100, 5))
+    max_depth_default = list(range(2, 20, 1))
+    min_child_samples_default = list(range(5, 100, 5))
+    subsample_default = list(np.arange(0.1, 1.01, 0.1))
+    colsample_bytree_default = list(np.arange(0.1, 1.01, 0.1))
+    reg_alpha_default = [0, 0.001, 0.01, 0.1, 1, 10]
+    reg_lambda_default = [0, 0.001, 0.01, 0.1, 10]
+
+    def __init__(
+        self,
+        dataset: str = "diabetes",
+        cv: int = 5,
+        objective: str = "maximize",
+        modifiers: Optional[List[BaseModifier]] = None,
+        memory: bool = False,
+        collect_data: bool = True,
+        callbacks=None,
+        catch_errors=None,
+        use_surrogate: bool = False,
+    ):
+        if dataset not in DATASETS:
+            raise ValueError(f"Unknown dataset '{dataset}'. Available: {self.available_datasets}")
+        if cv not in self.available_cv:
+            raise ValueError(f"Invalid cv={cv}. Available: {self.available_cv}")
+
+        self.dataset = dataset
+        self.cv = cv
+        self._dataset_loader = DATASETS[dataset]
+
+        super().__init__(
+            objective=objective,
+            modifiers=modifiers,
+            memory=memory,
+            collect_data=collect_data,
+            callbacks=callbacks,
+            catch_errors=catch_errors,
+            use_surrogate=use_surrogate,
+        )
+
+    @property
+    def search_space(self) -> Dict[str, Any]:
+        return {
+            "n_estimators": self.n_estimators_default,
+            "learning_rate": self.learning_rate_default,
+            "num_leaves": self.num_leaves_default,
+            "max_depth": self.max_depth_default,
+            "min_child_samples": self.min_child_samples_default,
+            "subsample": self.subsample_default,
+            "colsample_bytree": self.colsample_bytree_default,
+            "reg_alpha": self.reg_alpha_default,
+            "reg_lambda": self.reg_lambda_default,
+        }
+
+    def _create_objective_function(self) -> None:
+        """Create the objective function closure with fixed data."""
+        X, y = self._dataset_loader()
+        cv = self.cv
+
+        def objective(params: Dict[str, Any]) -> float:
+            reg = LGBMRegressor(
+                n_estimators=params["n_estimators"],
+                learning_rate=params["learning_rate"],
+                num_leaves=params["num_leaves"],
+                max_depth=params["max_depth"],
+                min_child_samples=params["min_child_samples"],
+                subsample=params["subsample"],
+                colsample_bytree=params["colsample_bytree"],
+                reg_alpha=params["reg_alpha"],
+                reg_lambda=params["reg_lambda"],
+                random_state=42,
+                n_jobs=-1,
+                verbose=-1,
+            )
+            scores = cross_val_score(reg, X, y, cv=cv, scoring="r2")
+            return scores.mean()
+
+        self.pure_objective_function = objective
+
+    def _get_surrogate_params(self, params: Dict[str, Any]) -> Dict[str, Any]:
+        return {**params, "dataset": self.dataset, "cv": self.cv}
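
Usage sketch for the new `ShekelFunction`. The import path and dict-based call convention come straight from the doctest in `shekel_function.py`; the expected value is the documented m=10 minimum, nothing here is invented beyond the print:

```python
from surfaces.test_functions import ShekelFunction

# Four fixed dimensions; m controls the number of maxima (5, 7, or 10).
func = ShekelFunction(m=10)

# Evaluate at the known global minimizer (4, 4, 4, 4).
value = func({"x0": 4.0, "x1": 4.0, "x2": 4.0, "x3": 4.0})
print(value)  # approximately -10.536 for m=10, per f_global in the class
```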
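And a hedged sketch for the new LightGBM surfaces: a coarse random search over the default `search_space`. The constructor arguments and the `search_space` property are taken from this diff; it assumes instances are callable with a parameter dict (as the ShekelFunction doctest shows for algebraic functions) and imports from the `machine_learning` subpackage wired up above rather than assuming a top-level re-export. The random-search loop itself is illustrative, not part of the package API:

```python
import random

from surfaces.test_functions.machine_learning import LightGBMClassifierFunction

func = LightGBMClassifierFunction(dataset="digits", cv=5)  # requires lightgbm

best_score, best_params = float("-inf"), None
for _ in range(20):
    # Draw one value per hyperparameter from the default discrete ranges.
    params = {name: random.choice(choices) for name, choices in func.search_space.items()}
    score = func(params)  # mean cross-validated accuracy
    if score > best_score:
        best_score, best_params = score, params

print(best_score, best_params)
```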