# Source code for grilly.optim.adam

"""
Adam Optimizer

Uses: adam-update.glsl, affect-adam.glsl
"""

from collections.abc import Iterator

import numpy as np

from .base import Optimizer


class Adam(Optimizer):
    """
    Adam optimizer using GPU-accelerated shaders.

    Uses: adam-update.glsl

    Implements the Adam algorithm:
    - m = beta1 * m + (1 - beta1) * grad
    - v = beta2 * v + (1 - beta2) * grad^2
    - m_hat = m / (1 - beta1^t)
    - v_hat = v / (1 - beta2^t)
    - param = param - lr * m_hat / (sqrt(v_hat) + eps)

    Every parameter is updated on the CPU with NumPy unless ``use_gpu`` is
    true, a compute backend can be created, and that backend exposes an
    ``adam-update`` or ``affect-adam`` shader.
    """

    def __init__(
        self,
        params: Iterator[np.ndarray],
        lr: float = 1e-3,
        betas: tuple = (0.9, 0.999),
        eps: float = 1e-8,
        weight_decay: float = 0.0,
        use_gpu: bool = True,
    ):
        """
        Initialize Adam optimizer.

        Args:
            params: Iterator of parameter arrays to optimize
            lr: Learning rate (default: 1e-3)
            betas: Coefficients for computing running averages (default: (0.9, 0.999))
            eps: Term added to denominator for numerical stability (default: 1e-8)
            weight_decay: Weight decay (L2 penalty) (default: 0.0)
            use_gpu: Whether to use GPU acceleration (default: True)
        """
        defaults = {
            "lr": lr,
            "betas": betas,
            "eps": eps,
            "weight_decay": weight_decay,
        }
        super().__init__(params, defaults)
        self.use_gpu = use_gpu
        self._backend = None
        # Remember a failed backend construction so it is not retried on every step.
        self._backend_failed = False
        self._step_count = 0
        # Gradients dict handed to the most recent step() call (None if not provided).
        self._gradients = None

    def _get_backend(self):
        """
        Get or create backend instance.

        Returns:
            The cached ``grilly.Compute`` instance, or None when it could not
            be imported or constructed. A failure is remembered, so later
            calls return None without trying again.
        """
        # getattr keeps instances created before _backend_failed existed working.
        if self._backend is None and not getattr(self, "_backend_failed", False):
            try:
                from grilly import Compute

                self._backend = Compute()
            except Exception as exc:  # best effort: any failure means "no GPU"
                import logging

                logging.getLogger(__name__).debug(
                    "Compute backend unavailable, using CPU: %s", exc
                )
                self._backend = None
                self._backend_failed = True
        return self._backend

    @staticmethod
    def _select_shader(backend):
        """Return the first Adam shader name registered on backend.core.shaders, or None."""
        shaders = getattr(getattr(backend, "core", None), "shaders", None)
        if shaders is None:
            return None
        try:
            for candidate in ("adam-update", "affect-adam"):
                if candidate in shaders:
                    return candidate
        except Exception:
            # An unusable shader registry simply disables the GPU path.
            return None
        return None

    @staticmethod
    def _resolve_grad(p, gradients):
        """Return the gradient for p as a numpy array, or None if there is none."""
        grad = None
        # An explicitly supplied gradients dict (keyed by id(param)) wins ...
        if gradients is not None:
            grad = gradients.get(id(p), None)
        # ... otherwise use the .grad attribute set by a backward pass.
        if grad is None:
            grad = getattr(p, "grad", None)
        if grad is None:
            return None

        if isinstance(grad, np.ndarray):
            # ndarray also has a .data attribute (its raw buffer), so it must
            # not be treated as a wrapper object.
            return np.asarray(grad, dtype=np.float32)
        grad = getattr(grad, "data", grad)
        if not isinstance(grad, np.ndarray):
            grad = np.array(grad, dtype=np.float32)
        return grad

    @staticmethod
    def _param_array(p):
        """Return the numpy array holding the values of p (unwrapping p.data if present)."""
        if isinstance(p, np.ndarray):
            # No copy when p is already float32, so the in-place update hits p directly.
            return np.asarray(p, dtype=np.float32)
        p_data = getattr(p, "data", p)
        if not isinstance(p_data, np.ndarray):
            p_data = np.asarray(p_data, dtype=np.float32)
        return p_data

    @staticmethod
    def _write_back(p, p_data):
        """Store the updated values p_data into the parameter p."""
        if hasattr(p, "data") and not isinstance(p, np.ndarray):
            # Parameter wrapper or custom class
            p.data = p_data
        else:
            # Direct numpy array - update in-place (never rebind ndarray.data)
            p[:] = p_data

    @staticmethod
    def _clear_grad(p):
        """Reset the gradient of p after it has been consumed."""
        if getattr(p, "grad", None) is not None:
            if hasattr(p, "zero_grad"):
                p.zero_grad()
            else:
                p.grad = None

    @staticmethod
    def _adam_update_cpu(
        param, grad, exp_avg, exp_avg_sq, lr, beta1, beta2, eps, beta1_t, beta2_t
    ):
        """
        NumPy implementation of one Adam update.

        ``param`` is modified in place; the moment estimates are returned as
        new arrays.

        Returns:
            Tuple ``(param, exp_avg, exp_avg_sq)``.
        """
        exp_avg = beta1 * exp_avg + (1 - beta1) * grad
        exp_avg_sq = beta2 * exp_avg_sq + (1 - beta2) * grad * grad

        # Bias correction; skipped when beta^t is not below 1 to avoid dividing by <= 0.
        m_hat = exp_avg / (1 - beta1_t) if beta1_t < 1.0 else exp_avg
        v_hat = exp_avg_sq / (1 - beta2_t) if beta2_t < 1.0 else exp_avg_sq

        param -= lr * m_hat / (np.sqrt(v_hat) + eps)
        return param, exp_avg, exp_avg_sq

    def step(self, closure=None, gradients=None):
        """
        Perform a single optimization step.

        Args:
            closure: Optional closure that reevaluates the model and returns loss
            gradients: Optional dict mapping parameter IDs to gradients.
                      If None, tries to get gradients from param.grad attribute.

        Returns:
            The value returned by ``closure``, or None when no closure is given.
        """
        import logging

        logger = logging.getLogger(__name__)

        loss = closure() if closure is not None else None

        # Store gradients for use in parameter updates
        self._gradients = gradients
        self._step_count += 1

        # NOTE(review): hyperparameters come from self.defaults, not from the
        # individual param group.
        beta1, beta2 = self.defaults["betas"]
        lr = self.defaults["lr"]
        eps = self.defaults["eps"]
        weight_decay = self.defaults["weight_decay"]

        # Bias correction terms (based on the optimizer-wide step counter)
        beta1_t = beta1**self._step_count
        beta2_t = beta2**self._step_count

        # Only touch the backend when GPU use was requested; the shader lookup
        # does not depend on the parameter, so do it once per step.
        backend = self._get_backend() if self.use_gpu else None
        shader_name = self._select_shader(backend) if backend is not None else None

        for group in self.param_groups:
            for p in group["params"]:
                if p is None:
                    continue

                state = self.state[id(p)]

                # Initialize state if needed
                if len(state) == 0:
                    state["step"] = 0
                    state["exp_avg"] = np.zeros_like(p, dtype=np.float32)
                    state["exp_avg_sq"] = np.zeros_like(p, dtype=np.float32)

                state["step"] += 1
                exp_avg = state["exp_avg"]
                exp_avg_sq = state["exp_avg_sq"]

                # If there is no gradient, skip this parameter
                grad = self._resolve_grad(p, gradients)
                if grad is None:
                    continue

                p_data = self._param_array(p)

                # Apply weight decay (L2 penalty folded into the gradient)
                if weight_decay != 0:
                    grad = grad + weight_decay * p_data

                updated_on_gpu = False
                if shader_name is not None:
                    try:
                        p_data, exp_avg, exp_avg_sq = self._adam_update_gpu(
                            backend,
                            p_data,
                            grad,
                            exp_avg,
                            exp_avg_sq,
                            lr,
                            beta1,
                            beta2,
                            eps,
                            beta1_t,
                            beta2_t,
                            shader_name=shader_name,
                        )
                        updated_on_gpu = True
                    except Exception as e:
                        logger.debug(
                            "GPU Adam update failed: %s, falling back to CPU", e
                        )

                if not updated_on_gpu:
                    p_data, exp_avg, exp_avg_sq = self._adam_update_cpu(
                        p_data,
                        grad,
                        exp_avg,
                        exp_avg_sq,
                        lr,
                        beta1,
                        beta2,
                        eps,
                        beta1_t,
                        beta2_t,
                    )

                self._write_back(p, p_data)
                state["exp_avg"] = exp_avg
                state["exp_avg_sq"] = exp_avg_sq

                # Clear gradient after update (from backward pass)
                self._clear_grad(p)

        return loss

    def _adam_update_gpu(
        self,
        backend,
        param,
        grad,
        exp_avg,
        exp_avg_sq,
        lr,
        beta1,
        beta2,
        eps,
        beta1_t,
        beta2_t,
        shader_name=None,
    ):
        """
        GPU-accelerated Adam update using adam-update.glsl shader.

        Calls ``backend.learning.adam_update`` when the backend provides it.
        If it does not, or the call fails, the update is computed on the CPU.

        Args:
            backend: Compute backend instance.
            param: Parameter values as a numpy array.
            grad: Gradient array (weight decay already applied by the caller).
            exp_avg: First moment estimate.
            exp_avg_sq: Second moment estimate.
            lr: Learning rate.
            beta1: Decay rate of the first moment.
            beta2: Decay rate of the second moment.
            eps: Denominator stabiliser.
            beta1_t: beta1 raised to the current step count.
            beta2_t: beta2 raised to the current step count.
            shader_name: Name of the shader that step() found on the backend.
                Accepted so that step() can pass it; not used by this
                implementation.

        Returns:
            Tuple ``(param, exp_avg, exp_avg_sq)`` with the updated values.
        """
        try:
            # Use backend's learning module
            if hasattr(backend, "learning") and hasattr(backend.learning, "adam_update"):
                new_param, new_exp_avg, new_exp_avg_sq = backend.learning.adam_update(
                    weights=param,
                    gradients=grad,
                    moment1=exp_avg,
                    moment2=exp_avg_sq,
                    learning_rate=lr,
                    beta1=beta1,
                    beta2=beta2,
                    epsilon=eps,
                    beta1_t=beta1_t,
                    beta2_t=beta2_t,
                    clear_grad=False,
                )
                return new_param, new_exp_avg, new_exp_avg_sq
        except Exception as exc:
            # Fall back to CPU if GPU fails
            import logging

            logging.getLogger(__name__).debug(
                "backend adam_update failed, using CPU: %s", exc
            )

        # CPU fallback
        return self._adam_update_cpu(
            param, grad, exp_avg, exp_avg_sq, lr, beta1, beta2, eps, beta1_t, beta2_t
        )
class AffectAdam(Adam):
    """
    Affect-aware Adam optimizer.

    Uses: affect-adam.glsl

    Similar to Adam but optimized for affect/emotion processing.
    Currently the update itself is delegated unchanged to Adam.
    """

    def __init__(
        self,
        params: Iterator[np.ndarray],
        lr: float = 1e-3,
        betas: tuple = (0.9, 0.999),
        eps: float = 1e-8,
        weight_decay: float = 0.0,
        use_gpu: bool = True,
    ):
        """
        Initialize AffectAdam optimizer.

        Args are the same as Adam.
        """
        super().__init__(
            params,
            lr=lr,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
            use_gpu=use_gpu,
        )

    def _adam_update_gpu(
        self,
        backend,
        param,
        grad,
        exp_avg,
        exp_avg_sq,
        lr,
        beta1,
        beta2,
        eps,
        beta1_t,
        beta2_t,
        shader_name=None,
    ):
        """
        GPU-accelerated AffectAdam update using affect-adam.glsl shader.

        Args are the same as Adam._adam_update_gpu. ``shader_name`` is accepted
        because Adam.step() passes it as a keyword argument; without it every
        call raised TypeError. It is not used yet.

        Returns:
            Tuple ``(param, exp_avg, exp_avg_sq)`` as produced by Adam.
        """
        # TODO: Implement GPU shader dispatch when backend method is available
        # For now, reuse Adam's update path. shader_name is intentionally not
        # forwarded so this works with a parent that does not accept it.
        return super()._adam_update_gpu(
            backend, param, grad, exp_avg, exp_avg_sq, lr, beta1, beta2, eps, beta1_t, beta2_t
        )