|
| 1 | +# Author: Simon Blanke |
| 2 | +# Email: simon.blanke@yahoo.com |
| 3 | +# License: MIT License |
| 4 | +"""CMA-ES using covariance matrix adaptation for continuous domains.""" |
| 5 | + |
| 6 | +from typing import Literal |
| 7 | + |
| 8 | +from .._init_utils import get_default_initialize |
| 9 | +from ..optimizers import ( |
| 10 | + CMAESOptimizer as _CMAESOptimizer, |
| 11 | +) |
| 12 | +from ..search import Search |
| 13 | + |
| 14 | + |
class CMAESOptimizer(_CMAESOptimizer, Search):
    """
    Evolutionary optimizer using covariance matrix adaptation.

    CMA-ES (Covariance Matrix Adaptation Evolution Strategy) is a
    state-of-the-art evolutionary algorithm for difficult continuous
    optimization problems. It adapts a full covariance matrix to learn
    the correlation structure of the fitness landscape, enabling
    efficient search even when parameters are strongly correlated or
    have different sensitivities.

    The algorithm maintains a multivariate normal distribution and
    iteratively:

    1. Samples ``population`` candidate solutions from the distribution
    2. Evaluates and ranks them by fitness
    3. Updates the distribution mean toward the best solutions
    4. Adapts the covariance matrix using evolution paths
    5. Controls the global step size via cumulative step-size adaptation

    CMA-ES is considered the gold standard for continuous black-box
    optimization. For mixed search spaces (discrete, categorical),
    this implementation samples in continuous space and rounds to the
    nearest valid value, which is a pragmatic compromise.

    The algorithm is well-suited for:

    - Continuous optimization with correlated parameters
    - Problems where parameter sensitivities differ strongly
    - Moderate dimensionality (up to ~100 dimensions)
    - Multi-modal landscapes (especially with IPOP restart)

    Parameters
    ----------
    search_space : dict[str, list]
        The search space to explore, defined as a dictionary mapping parameter
        names to arrays of possible values.

        Each key is a parameter name (string), and each value is a numpy array
        or list of discrete values that the parameter can take. The optimizer
        will only evaluate positions that are on this discrete grid.

        Example: A 2D search space with 100 points per dimension::

            search_space = {
                "x": np.linspace(-10, 10, 100),
                "y": np.linspace(-10, 10, 100),
            }

        The resolution of each dimension (number of points in the array)
        directly affects optimization quality and speed. More points give
        finer resolution but increase the search space size exponentially.
    initialize : dict[str, int] or None, default=None
        Strategy for generating initial positions before the main optimization
        loop begins. ``None`` is replaced by the package-wide default from
        ``get_default_initialize()`` (documented as ``{"vertices": 4,
        "random": 2}``). Initialization samples are evaluated first, and the
        best one becomes the starting point (mean) for the CMA-ES
        distribution.

        Supported keys:

        - ``"grid"``: ``int`` -- Number of positions on a regular grid.
        - ``"vertices"``: ``int`` -- Number of corner/edge positions of the
          search space.
        - ``"random"``: ``int`` -- Number of uniformly random positions.
        - ``"warm_start"``: ``list[dict]`` -- Specific positions to evaluate,
          each as a dict mapping parameter names to values.

        Multiple strategies can be combined::

            initialize = {"vertices": 4, "random": 10}
            initialize = {"warm_start": [{"x": 0.5, "y": 1.0}], "random": 5}

        More initialization samples improve the starting point but consume
        iterations from ``n_iter``. For expensive objectives, a few targeted
        warm-start points are often more efficient than many random samples.
    constraints : list[callable] or None, default=None
        A list of constraint functions that restrict the search space
        (``None`` is treated as no constraints). Each constraint is a
        callable that receives a parameter dictionary and returns ``True``
        if the position is valid, ``False`` if it should be rejected.

        Rejected positions are discarded and regenerated: the optimizer
        resamples a new candidate position (up to 100 retries per step).
        During initialization, positions that violate constraints are
        filtered out entirely.

        Example: Constrain the search to a circular region::

            def circular_constraint(para):
                return para["x"]**2 + para["y"]**2 <= 25

            constraints = [circular_constraint]

        Multiple constraints are combined with AND logic (all must return
        ``True``).
    random_state : int or None, default=None
        Seed for the random number generator to ensure reproducible results.

        - ``None``: Use a new random state each run (non-deterministic).
        - ``int``: Seed the random number generator for reproducibility.

        Setting a fixed seed is recommended for debugging and benchmarking.
        Different seeds may lead to different optimization trajectories,
        especially for stochastic optimizers.
    rand_rest_p : float, default=0
        Probability of performing a random restart instead of the normal
        algorithm step. At each iteration, a uniform random number is drawn;
        if it falls below ``rand_rest_p``, the optimizer jumps to a random
        position instead of following its strategy.

        - ``0.0``: No random restarts (pure algorithm behavior).
        - ``0.01-0.05``: Light diversification, helps escape shallow local
          optima.
        - ``0.1-0.3``: Aggressive restarts, useful for highly multi-modal
          landscapes.
        - ``1.0``: Equivalent to random search.
    nth_process : int or None, default=None
        Index of the current process when the optimizer runs as part of a
        parallel/multiprocessing search; forwarded unchanged to the core
        optimizer (used there e.g. to decorrelate random streams between
        workers). Leave as ``None`` for single-process runs.
    population : int or None, default=None
        Number of candidate solutions sampled per generation (lambda in
        CMA-ES notation). If ``None``, uses the standard heuristic:
        ``4 + floor(3 * ln(n_dimensions))``.

        - ``None``: Auto-compute based on dimensionality (recommended).
        - ``10-20``: Small populations for fast convergence on simple
          problems.
        - ``50-100``: Large populations for better exploration on
          multimodal or high-dimensional problems.

        Each generation requires ``population`` function evaluations,
        so total cost per generation scales linearly with this parameter.
    mu : int or None, default=None
        Number of best solutions selected as parents for the next
        generation. If ``None``, uses ``population // 2``.

        - ``None``: Auto-compute as half the population (recommended).
        - Smaller ``mu``: Stronger selection pressure, faster convergence
          but higher risk of premature convergence.
        - Larger ``mu``: Weaker selection pressure, better exploration.

        Must be less than or equal to ``population``.
    sigma : float, default=0.3
        Initial step size as a fraction of the normalized search space
        range. Controls the initial spread of sampled solutions around
        the mean.

        - ``0.1``: Conservative, tight initial sampling.
        - ``0.3``: Standard starting point (default).
        - ``0.5``: Broad initial exploration.

        CMA-ES adapts sigma automatically during optimization, so the
        initial value is not critical. Values between 0.1 and 0.5
        generally work well.
    ipop_restart : bool, default=False
        Enable IPOP (Increasing Population) restart strategy. When
        stagnation is detected (no improvement for many generations),
        the algorithm restarts with a doubled population size and a
        random starting point.

        - ``False``: No restarts, single run (default).
        - ``True``: Enable IPOP restarts for better global search on
          multimodal landscapes.

        IPOP-CMA-ES is particularly effective for problems with many
        local optima, as it combines the precision of CMA-ES with
        increasingly thorough global search.

    Notes
    -----
    CMA-ES adapts the search distribution using two evolution paths:

    - **Cumulation path for sigma** (p_sigma): Controls global step size
      via Cumulative Step-size Adaptation (CSA). If steps are correlated
      (consistent direction), sigma increases; if anti-correlated
      (oscillating), sigma decreases.
    - **Cumulation path for C** (p_c): Provides the rank-one update to
      the covariance matrix, capturing the dominant search direction.

    The covariance matrix is updated via:

    - **Rank-one update**: Uses p_c to learn the principal search
      direction.
    - **Rank-mu update**: Uses all mu selected solutions to learn the
      local landscape shape.

    For mixed search spaces (discrete/categorical dimensions), the
    algorithm operates in a normalized continuous space and maps back
    to valid values via rounding. This is a standard approach (MI-CMA-ES)
    that preserves the covariance adaptation while supporting non-continuous
    parameters.

    See Also
    --------
    EvolutionStrategyOptimizer : Simpler ES with self-adaptive sigma.
    DifferentialEvolutionOptimizer : DE using vector differences.
    ParticleSwarmOptimizer : Swarm intelligence approach.

    Examples
    --------
    >>> import numpy as np
    >>> from gradient_free_optimizers import CMAESOptimizer

    >>> def rosenbrock(para):
    ...     x, y = para["x"], para["y"]
    ...     return -(100 * (y - x**2)**2 + (1 - x)**2)

    >>> search_space = {
    ...     "x": np.linspace(-5, 5, 1000),
    ...     "y": np.linspace(-5, 5, 1000),
    ... }

    >>> opt = CMAESOptimizer(search_space, population=20, sigma=0.3)
    >>> opt.search(rosenbrock, n_iter=500)
    """

    def __init__(
        self,
        search_space: dict[str, list],
        initialize: dict[
            Literal["grid", "vertices", "random", "warm_start"],
            int | list[dict],
        ]
        | None = None,
        constraints: list[callable] | None = None,
        random_state: int | None = None,
        rand_rest_p: float = 0,
        nth_process: int | None = None,
        population: int | None = None,
        mu: int | None = None,
        sigma: float = 0.3,
        ipop_restart: bool = False,
    ):
        # Resolve mutable defaults per call (never as argument defaults)
        # to avoid the shared-mutable-default pitfall.
        if initialize is None:
            initialize = get_default_initialize()
        if constraints is None:
            constraints = []

        # All arguments are forwarded unchanged to the core optimizer,
        # which implements the actual CMA-ES machinery.
        super().__init__(
            search_space=search_space,
            initialize=initialize,
            constraints=constraints,
            random_state=random_state,
            rand_rest_p=rand_rest_p,
            nth_process=nth_process,
            population=population,
            mu=mu,
            sigma=sigma,
            ipop_restart=ipop_restart,
        )
0 commit comments