@@ -19,9 +19,7 @@ module nf_optimizers
    real :: learning_rate = 0.01
  contains
    procedure(init), deferred :: init
-    procedure(minimize_1d), deferred :: minimize_1d
-    procedure(minimize_2d), deferred :: minimize_2d
-    generic :: minimize => minimize_1d, minimize_2d
+    procedure(minimize), deferred :: minimize
  end type optimizer_base_type

  abstract interface
@@ -32,19 +30,12 @@ impure elemental subroutine init(self, num_params)
      integer, intent(in) :: num_params
    end subroutine init

-    pure subroutine minimize_1d(self, param, gradient)
+    pure subroutine minimize(self, param, gradient)
      import :: optimizer_base_type
      class(optimizer_base_type), intent(inout) :: self
      real, intent(inout) :: param(:)
      real, intent(in) :: gradient(:)
-    end subroutine minimize_1d
-
-    pure subroutine minimize_2d(self, param, gradient)
-      import :: optimizer_base_type
-      class(optimizer_base_type), intent(inout) :: self
-      real, intent(inout) :: param(:,:)
-      real, intent(in) :: gradient(:,:)
-    end subroutine minimize_2d
+    end subroutine minimize


  end interface
@@ -55,8 +46,7 @@ end subroutine minimize_2d
    real, allocatable, private :: velocity(:)
  contains
    procedure :: init => init_sgd
-    procedure :: minimize_1d => minimize_sgd_1d
-    procedure :: minimize_2d => minimize_sgd_2d
+    procedure :: minimize => minimize_sgd
  end type sgd

  type, extends(optimizer_base_type) :: rmsprop
@@ -71,8 +61,7 @@ end subroutine minimize_2d
    real, allocatable, private :: rms_gradient(:)
  contains
    procedure :: init => init_rmsprop
-    procedure :: minimize_1d => minimize_rmsprop_1d
-    procedure :: minimize_2d => minimize_rmsprop_2d
+    procedure :: minimize => minimize_rmsprop
  end type rmsprop

  type, extends(optimizer_base_type) :: adam
@@ -95,8 +84,7 @@ end subroutine minimize_2d
    integer, private :: t = 0
  contains
    procedure :: init => init_adam
-    procedure :: minimize_1d => minimize_adam_1d
-    procedure :: minimize_2d => minimize_adam_2d
+    procedure :: minimize => minimize_adam
  end type adam

  type, extends(optimizer_base_type) :: adagrad
@@ -113,8 +101,7 @@ end subroutine minimize_2d
    integer, private :: t = 0
  contains
    procedure :: init => init_adagrad
-    procedure :: minimize_1d => minimize_adagrad_1d
-    procedure :: minimize_2d => minimize_adagrad_2d
+    procedure :: minimize => minimize_adagrad
  end type adagrad

contains
@@ -129,7 +116,7 @@ impure elemental subroutine init_sgd(self, num_params)
  end subroutine init_sgd


-  pure subroutine minimize_sgd_1d(self, param, gradient)
+  pure subroutine minimize_sgd(self, param, gradient)
    !! Concrete implementation of a stochastic gradient descent optimizer
    !! update rule.
    class(sgd), intent(inout) :: self
@@ -152,33 +139,7 @@ pure subroutine minimize_sgd_1d(self, param, gradient)
      param = param - self % learning_rate * gradient
    end if

-  end subroutine minimize_sgd_1d
-
-
-  pure subroutine minimize_sgd_2d(self, param, gradient)
-    !! Concrete implementation of a stochastic gradient descent optimizer
-    !! update rule for 2D arrays.
-    class(sgd), intent(inout) :: self
-    real, intent(inout) :: param(:,:)
-    real, intent(in) :: gradient(:,:)
-
-    if (self % momentum > 0) then
-      ! Apply momentum update
-      self % velocity = self % momentum * self % velocity &
-        - self % learning_rate * reshape(gradient, [size(gradient)])
-      if (self % nesterov) then
-        ! Apply Nesterov update
-        param = param + reshape(self % momentum * self % velocity &
-          - self % learning_rate * reshape(gradient, [size(gradient)]), shape(param))
-      else
-        param = param + reshape(self % velocity, shape(param))
-      end if
-    else
-      ! Apply regular update
-      param = param - self % learning_rate * gradient
-    end if
-
-  end subroutine minimize_sgd_2d
+  end subroutine minimize_sgd


  impure elemental subroutine init_rmsprop(self, num_params)
@@ -191,7 +152,7 @@ impure elemental subroutine init_rmsprop(self, num_params)
  end subroutine init_rmsprop


-  pure subroutine minimize_rmsprop_1d(self, param, gradient)
+  pure subroutine minimize_rmsprop(self, param, gradient)
    !! Concrete implementation of a RMSProp optimizer update rule.
    class(rmsprop), intent(inout) :: self
    real, intent(inout) :: param(:)
@@ -205,24 +166,7 @@ pure subroutine minimize_rmsprop_1d(self, param, gradient)
    param = param - self % learning_rate &
      / sqrt(self % rms_gradient + self % epsilon) * gradient

-  end subroutine minimize_rmsprop_1d
-
-
-  pure subroutine minimize_rmsprop_2d(self, param, gradient)
-    !! Concrete implementation of a RMSProp optimizer update rule for 2D arrays.
-    class(rmsprop), intent(inout) :: self
-    real, intent(inout) :: param(:,:)
-    real, intent(in) :: gradient(:,:)
-
-    ! Compute the RMS of the gradient using the RMSProp rule
-    self % rms_gradient = self % decay_rate * self % rms_gradient &
-      + (1 - self % decay_rate) * reshape(gradient, [size(gradient)])**2
-
-    ! Update the network parameters based on the new RMS of the gradient
-    param = param - self % learning_rate &
-      / sqrt(reshape(self % rms_gradient, shape(param)) + self % epsilon) * gradient
-
-  end subroutine minimize_rmsprop_2d
+  end subroutine minimize_rmsprop


  impure elemental subroutine init_adam(self, num_params)
@@ -236,7 +180,7 @@ impure elemental subroutine init_adam(self, num_params)
  end subroutine init_adam


-  pure subroutine minimize_adam_1d(self, param, gradient)
+  pure subroutine minimize_adam(self, param, gradient)
    !! Concrete implementation of an Adam optimizer update rule.
    class(adam), intent(inout) :: self
    real, intent(inout) :: param(:)
@@ -264,38 +208,7 @@ pure subroutine minimize_adam_1d(self, param, gradient)

    end associate

-  end subroutine minimize_adam_1d
-
-
-  pure subroutine minimize_adam_2d(self, param, gradient)
-    !! Concrete implementation of an Adam optimizer update rule for 2D arrays.
-    class(adam), intent(inout) :: self
-    real, intent(inout) :: param(:,:)
-    real, intent(in) :: gradient(:,:)
-
-    self % t = self % t + 1
-
-    ! If weight_decay_l2 > 0, use L2 regularization;
-    ! otherwise, default to regular Adam.
-    associate(g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)]))
-      self % m = self % beta1 * self % m + (1 - self % beta1) * g
-      self % v = self % beta2 * self % v + (1 - self % beta2) * g**2
-    end associate
-
-    ! Compute bias-corrected first and second moment estimates.
-    associate( &
-      m_hat => self % m / (1 - self % beta1**self % t), &
-      v_hat => self % v / (1 - self % beta2**self % t) &
-    )
-
-      ! Update parameters.
-      param = param &
-        - self % learning_rate * reshape(m_hat / (sqrt(v_hat) + self % epsilon), shape(param)) &
-        - self % learning_rate * self % weight_decay_decoupled * param
-
-    end associate
-
-  end subroutine minimize_adam_2d
+  end subroutine minimize_adam


  impure elemental subroutine init_adagrad(self, num_params)
@@ -308,7 +221,7 @@ impure elemental subroutine init_adagrad(self, num_params)
  end subroutine init_adagrad


-  pure subroutine minimize_adagrad_1d(self, param, gradient)
+  pure subroutine minimize_adagrad(self, param, gradient)
    !! Concrete implementation of an Adagrad optimizer update rule.
    class(adagrad), intent(inout) :: self
    real, intent(inout) :: param(:)
@@ -333,34 +246,6 @@ pure subroutine minimize_adagrad_1d(self, param, gradient)

    end associate

-  end subroutine minimize_adagrad_1d
-
-
-  pure subroutine minimize_adagrad_2d(self, param, gradient)
-    !! Concrete implementation of an Adagrad optimizer update rule for 2D arrays.
-    class(adagrad), intent(inout) :: self
-    real, intent(inout) :: param(:,:)
-    real, intent(in) :: gradient(:,:)
-
-    ! Update the current time step
-    self % t = self % t + 1
-
-    associate( &
-      ! If weight_decay_l2 > 0, use L2 regularization;
-      ! otherwise, default to regular Adagrad.
-      g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)]), &
-      ! Amortize the learning rate as function of the current time step.
-      learning_rate => self % learning_rate &
-        / (1 + (self % t - 1) * self % learning_rate_decay) &
-    )
-
-      self % sum_squared_gradient = self % sum_squared_gradient + g**2
-
-      param = param - learning_rate * reshape(g / (sqrt(self % sum_squared_gradient) &
-        + self % epsilon), shape(param))
-
-    end associate
-
-  end subroutine minimize_adagrad_2d
+  end subroutine minimize_adagrad

end module nf_optimizers
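
With the 2D overloads removed, a caller that keeps rank-2 parameters (for example a dense-layer weight matrix) flattens them before calling the single rank-1 minimize binding. The sketch below is a minimal, hypothetical caller-side example, not part of the patch: it assumes the nf_optimizers module compiles as shown in this diff, that the sgd type and its learning_rate and momentum components are accessible from outside the module, and the program name and array values are illustrative only.

! Hypothetical usage sketch: flatten a rank-2 weight array, update it through
! the unified rank-1 minimize binding, then restore its original shape.
program sgd_minimize_demo
  use nf_optimizers, only: sgd
  implicit none

  type(sgd) :: optimizer
  real :: weights(3, 2), gradients(3, 2)
  real, allocatable :: flat_weights(:)

  weights = 1.0
  gradients = 0.1

  ! Allocate the optimizer state (velocity) for the flattened parameter count.
  optimizer = sgd(learning_rate=0.01, momentum=0.9)
  call optimizer % init(size(weights))

  ! Reshape to rank 1, update in place, reshape back.
  flat_weights = reshape(weights, [size(weights)])
  call optimizer % minimize(flat_weights, reshape(gradients, [size(gradients)]))
  weights = reshape(flat_weights, shape(weights))

  print *, 'Updated weights:', weights

end program sgd_minimize_demo

This flatten-and-restore pattern mirrors what the deleted *_2d implementations did internally with reshape, which is why a single rank-1 deferred minimize is sufficient for the base type.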