diff --git a/src/nf/nf_conv1d_layer.f90 b/src/nf/nf_conv1d_layer.f90 index c39b11fc..65f82347 100644 --- a/src/nf/nf_conv1d_layer.f90 +++ b/src/nf/nf_conv1d_layer.f90 @@ -31,9 +31,10 @@ module nf_conv1d_layer procedure :: forward procedure :: backward - procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: init procedure :: set_params @@ -97,14 +98,25 @@ module function get_params(self) result(params) !! Parameters to get end function get_params - module function get_gradients(self) result(gradients) - !! Return the gradients of this layer. - !! The gradients are ordered as weights first, biases second. + module subroutine get_params_ptr(self, w_ptr, b_ptr) + !! Return pointers to the parameters (weights and biases) of this layer. class(conv1d_layer), intent(in), target :: self !! A `conv1d_layer` instance - real, allocatable :: gradients(:) - !! Gradients to get - end function get_gradients + real, pointer, intent(out) :: w_ptr(:) + !! Pointer to the kernel weights (flattened) + real, pointer, intent(out) :: b_ptr(:) + !! Pointer to the biases + end subroutine get_params_ptr + + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + !! Return pointers to the gradients of this layer. + class(conv1d_layer), intent(in), target :: self + !! A `conv1d_layer` instance + real, pointer, intent(out) :: dw_ptr(:) + !! Pointer to the kernel weight gradients (flattened) + real, pointer, intent(out) :: db_ptr(:) + !! Pointer to the bias gradients + end subroutine get_gradients_ptr module subroutine set_params(self, params) !! Set the parameters of the layer. diff --git a/src/nf/nf_conv1d_layer_submodule.f90 b/src/nf/nf_conv1d_layer_submodule.f90 index 5404b9c7..98856689 100644 --- a/src/nf/nf_conv1d_layer_submodule.f90 +++ b/src/nf/nf_conv1d_layer_submodule.f90 @@ -152,13 +152,21 @@ module function get_params(self) result(params) params = [ w_, self % biases] end function get_params - module function get_gradients(self) result(gradients) + module subroutine get_params_ptr(self, w_ptr, b_ptr) class(conv1d_layer), intent(in), target :: self - real, allocatable :: gradients(:) - real, pointer :: dw_(:) => null() - dw_(1:size(self % dw)) => self % dw - gradients = [ dw_, self % db ] - end function get_gradients + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + w_ptr(1:size(self % kernel)) => self % kernel + b_ptr => self % biases + end subroutine get_params_ptr + + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(conv1d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw + db_ptr => self % db + end subroutine get_gradients_ptr module subroutine set_params(self, params) class(conv1d_layer), intent(in out) :: self diff --git a/src/nf/nf_conv2d_layer.f90 b/src/nf/nf_conv2d_layer.f90 index 4b79376e..d6c92c31 100644 --- a/src/nf/nf_conv2d_layer.f90 +++ b/src/nf/nf_conv2d_layer.f90 @@ -32,9 +32,10 @@ module nf_conv2d_layer procedure :: forward procedure :: backward - procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: init procedure :: set_params @@ -98,14 +99,25 @@ module function get_params(self) result(params) !! Parameters to get end function get_params - module function get_gradients(self) result(gradients) - !! 
Return the gradients of this layer. - !! The gradients are ordered as weights first, biases second. + module subroutine get_params_ptr(self, w_ptr, b_ptr) + !! Return pointers to the parameters (weights and biases) of this layer. class(conv2d_layer), intent(in), target :: self !! A `conv2d_layer` instance - real, allocatable :: gradients(:) - !! Gradients to get - end function get_gradients + real, pointer, intent(out) :: w_ptr(:) + !! Pointer to the kernel weights (flattened) + real, pointer, intent(out) :: b_ptr(:) + !! Pointer to the biases + end subroutine get_params_ptr + + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + !! Return pointers to the gradients of this layer. + class(conv2d_layer), intent(in), target :: self + !! A `conv2d_layer` instance + real, pointer, intent(out) :: dw_ptr(:) + !! Pointer to the kernel weight gradients (flattened) + real, pointer, intent(out) :: db_ptr(:) + !! Pointer to the bias gradients + end subroutine get_gradients_ptr module subroutine set_params(self, params) !! Set the parameters of the layer. diff --git a/src/nf/nf_conv2d_layer_submodule.f90 b/src/nf/nf_conv2d_layer_submodule.f90 index 45a2c1da..56b398fc 100644 --- a/src/nf/nf_conv2d_layer_submodule.f90 +++ b/src/nf/nf_conv2d_layer_submodule.f90 @@ -204,21 +204,23 @@ module function get_params(self) result(params) end function get_params - - module function get_gradients(self) result(gradients) + + module subroutine get_params_ptr(self, w_ptr, b_ptr) class(conv2d_layer), intent(in), target :: self - real, allocatable :: gradients(:) - - real, pointer :: dw_(:) => null() + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + w_ptr(1:size(self % kernel)) => self % kernel + b_ptr => self % biases + end subroutine get_params_ptr - dw_(1:size(self % dw)) => self % dw - gradients = [ & - dw_, & - self % db & - ] - - end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(conv2d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw + db_ptr => self % db + end subroutine get_gradients_ptr module subroutine set_params(self, params) diff --git a/src/nf/nf_dense_layer.f90 b/src/nf/nf_dense_layer.f90 index 862f4cdf..e93a57ca 100644 --- a/src/nf/nf_dense_layer.f90 +++ b/src/nf/nf_dense_layer.f90 @@ -33,9 +33,10 @@ module nf_dense_layer procedure :: backward procedure :: forward - procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: init procedure :: set_params @@ -96,14 +97,17 @@ module function get_params(self) result(params) !! Parameters of this layer end function get_params - module function get_gradients(self) result(gradients) - !! Return the gradients of this layer. - !! The gradients are ordered as weights first, biases second. + module subroutine get_params_ptr(self, w_ptr, b_ptr) class(dense_layer), intent(in), target :: self - !! Dense layer instance - real, allocatable :: gradients(:) - !! 
Gradients of this layer - end function get_gradients + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + end subroutine get_params_ptr + + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(dense_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + end subroutine get_gradients_ptr module subroutine set_params(self, params) !! Set the parameters of this layer. diff --git a/src/nf/nf_dense_layer_submodule.f90 b/src/nf/nf_dense_layer_submodule.f90 index a424cf9c..c2f7e236 100644 --- a/src/nf/nf_dense_layer_submodule.f90 +++ b/src/nf/nf_dense_layer_submodule.f90 @@ -77,20 +77,22 @@ module function get_params(self) result(params) end function get_params - module function get_gradients(self) result(gradients) + module subroutine get_params_ptr(self, w_ptr, b_ptr) class(dense_layer), intent(in), target :: self - real, allocatable :: gradients(:) + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + w_ptr(1:size(self % weights)) => self % weights + b_ptr => self % biases + end subroutine get_params_ptr - real, pointer :: dw_(:) => null() - dw_(1:size(self % dw)) => self % dw - - gradients = [ & - dw_, & - self % db & - ] - - end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(dense_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw + db_ptr => self % db + end subroutine get_gradients_ptr module subroutine set_params(self, params) diff --git a/src/nf/nf_layer.f90 b/src/nf/nf_layer.f90 index 517622b0..79569845 100644 --- a/src/nf/nf_layer.f90 +++ b/src/nf/nf_layer.f90 @@ -22,13 +22,13 @@ module nf_layer integer, allocatable :: layer_shape(:) integer, allocatable :: input_layer_shape(:) logical :: initialized = .false. + class(optimizer_base_type), allocatable :: optimizer contains procedure :: forward procedure :: get_num_params procedure :: get_params - procedure :: get_gradients procedure :: set_params procedure :: init procedure :: print_info @@ -160,14 +160,6 @@ module function get_params(self) result(params) !! Parameters of this layer end function get_params - module function get_gradients(self) result(gradients) - !! Returns the gradients of this layer. - class(layer), intent(in) :: self - !! Layer instance - real, allocatable :: gradients(:) - !! Gradients of this layer - end function get_gradients - module subroutine set_params(self, params) !! Returns the parameters of this layer. class(layer), intent(in out) :: self diff --git a/src/nf/nf_layer_submodule.f90 b/src/nf/nf_layer_submodule.f90 index eebedaa9..778d227a 100644 --- a/src/nf/nf_layer_submodule.f90 +++ b/src/nf/nf_layer_submodule.f90 @@ -682,50 +682,6 @@ module function get_params(self) result(params) end function get_params - module function get_gradients(self) result(gradients) - class(layer), intent(in) :: self - real, allocatable :: gradients(:) - - select type (this_layer => self % p) - type is (input1d_layer) - ! No gradients to get. - type is (input2d_layer) - ! No gradients to get. - type is (input3d_layer) - ! No gradients to get. - type is (dense_layer) - gradients = this_layer % get_gradients() - type is (dropout_layer) - ! No gradients to get. 
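Note on the new accessors: the get_params_ptr/get_gradients_ptr routines above (conv1d, conv2d, dense) all use Fortran 2008 bounds-remapping pointer assignment to expose a multi-dimensional weight or gradient array as a flat rank-1 view, so the optimizer can read and write the layer's storage directly instead of working on the allocatable copies that the old get_gradients functions returned. The following is a standalone sketch of that mechanism with made-up array shapes; it is not part of the patch.

program flat_view_sketch
  implicit none
  real, target :: kernel(2, 3, 4)   ! hypothetical conv kernel; any shape works
  real, pointer :: w_ptr(:)

  call random_number(kernel)

  ! Bounds-remapping pointer assignment: a rank-1 pointer is associated with a
  ! simply contiguous rank-3 target, viewing its elements in array element
  ! order without copying them.
  w_ptr(1:size(kernel)) => kernel

  ! Writing through the flat view modifies the original array in place, which
  ! is why the network no longer needs to gather gradients into one vector and
  ! copy parameters back with set_params.
  w_ptr = 0.5 * w_ptr
  print *, size(w_ptr), kernel(1, 1, 1), w_ptr(1)
end program flat_view_sketch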
- type is (conv1d_layer) - gradients = this_layer % get_gradients() - type is (conv2d_layer) - gradients = this_layer % get_gradients() - type is (locally_connected1d_layer) - gradients = this_layer % get_gradients() - type is (maxpool1d_layer) - ! No gradients to get. - type is (maxpool2d_layer) - ! No gradients to get. - type is (flatten_layer) - ! No gradients to get. - type is (reshape2d_layer) - ! No parameters to get. - type is (reshape3d_layer) - ! No gradients to get. - type is (linear2d_layer) - gradients = this_layer % get_gradients() - type is (self_attention_layer) - gradients = this_layer % get_gradients() - type is (embedding_layer) - gradients = this_layer % get_gradients() - type is (layernorm_layer) - gradients = this_layer % get_gradients() - class default - error stop 'Unknown layer type.' - end select - - end function get_gradients module subroutine set_params(self, params) class(layer), intent(in out) :: self diff --git a/src/nf/nf_layernorm.f90 b/src/nf/nf_layernorm.f90 index 36ef56f0..7bffc06a 100644 --- a/src/nf/nf_layernorm.f90 +++ b/src/nf/nf_layernorm.f90 @@ -38,7 +38,9 @@ module nf_layernorm_layer procedure :: init procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: set_params end type layernorm_layer @@ -78,12 +80,24 @@ module function get_params(self) result(params) end function get_params + module subroutine get_params_ptr(self, g_ptr, b_ptr) + class(layernorm_layer), intent(in), target :: self + real, pointer, intent(out) :: g_ptr(:), b_ptr(:) + end subroutine get_params_ptr + + module function get_gradients(self) result(gradients) class(layernorm_layer), intent(in), target :: self real, allocatable :: gradients(:) end function get_gradients + module subroutine get_gradients_ptr(self, dg_ptr, db_ptr) + class(layernorm_layer), intent(in), target :: self + real, pointer, intent(out) :: dg_ptr(:), db_ptr(:) + end subroutine get_gradients_ptr + + module subroutine set_params(self, params) class(layernorm_layer), intent(in out) :: self real, intent(in), target :: params(:) diff --git a/src/nf/nf_layernorm_submodule.f90 b/src/nf/nf_layernorm_submodule.f90 index 4eaa4382..5e357b33 100644 --- a/src/nf/nf_layernorm_submodule.f90 +++ b/src/nf/nf_layernorm_submodule.f90 @@ -112,25 +112,31 @@ end function get_num_params module function get_params(self) result(params) class(layernorm_layer), intent(in), target :: self real, allocatable :: params(:) + params = [self % gamma, self % beta] + end function get_params - params = [ & - self % gamma, & - self % beta & - ] - end function get_params + module subroutine get_params_ptr(self, g_ptr, b_ptr) + class(layernorm_layer), intent(in), target :: self + real, pointer, intent(out) :: g_ptr(:), b_ptr(:) + g_ptr => self % gamma + b_ptr => self % beta + end subroutine get_params_ptr module function get_gradients(self) result(gradients) class(layernorm_layer), intent(in), target :: self real, allocatable :: gradients(:) + gradients = [self % d_gamma, self % d_beta] + end function get_gradients - gradients = [ & - self % d_gamma, & - self % d_beta & - ] - end function get_gradients + module subroutine get_gradients_ptr(self, dg_ptr, db_ptr) + class(layernorm_layer), intent(in), target :: self + real, pointer, intent(out) :: dg_ptr(:), db_ptr(:) + dg_ptr => self % d_gamma + db_ptr => self % d_beta + end subroutine get_gradients_ptr module subroutine set_params(self, params) diff --git a/src/nf/nf_linear2d_layer.f90 
b/src/nf/nf_linear2d_layer.f90 index f785a14c..f2c8fd16 100644 --- a/src/nf/nf_linear2d_layer.f90 +++ b/src/nf/nf_linear2d_layer.f90 @@ -25,7 +25,9 @@ module nf_linear2d_layer procedure :: init procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: set_params end type linear2d_layer @@ -64,11 +66,21 @@ module function get_params(self) result(params) real, allocatable :: params(:) end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(linear2d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:), b_ptr(:) + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) class(linear2d_layer), intent(in), target :: self real, allocatable :: gradients(:) end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(linear2d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:), db_ptr(:) + end subroutine get_gradients_ptr + module subroutine set_params(self, params) class(linear2d_layer), intent(in out) :: self real, intent(in), target :: params(:) diff --git a/src/nf/nf_linear2d_layer_submodule.f90 b/src/nf/nf_linear2d_layer_submodule.f90 index 0dfe7e27..513527f0 100644 --- a/src/nf/nf_linear2d_layer_submodule.f90 +++ b/src/nf/nf_linear2d_layer_submodule.f90 @@ -82,33 +82,35 @@ end function get_num_params module function get_params(self) result(params) class(linear2d_layer), intent(in), target :: self real, allocatable :: params(:) - real, pointer :: w_(:) => null() + w_(1: size(self % weights)) => self % weights + params = [w_, self % biases] + end function get_params - w_(1: product(shape(self % weights))) => self % weights - - params = [ & - w_, & - self % biases & - ] - end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(linear2d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:), b_ptr(:) + w_ptr(1:size(self % weights)) => self % weights + b_ptr => self % biases + end subroutine get_params_ptr module function get_gradients(self) result(gradients) class(linear2d_layer), intent(in), target :: self real, allocatable :: gradients(:) - real, pointer :: dw_(:) => null() + dw_(1:size(self % dw)) => self % dw + gradients = [dw_, self % db] + end function get_gradients - dw_(1: product(shape(self % dw))) => self % dw - - gradients = [ & - dw_, & - self % db & - ] - end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(linear2d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:), db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw + db_ptr => self % db + end subroutine get_gradients_ptr module subroutine set_params(self, params) diff --git a/src/nf/nf_locally_connected1d_layer.f90 b/src/nf/nf_locally_connected1d_layer.f90 index beca76d5..6fea2c5c 100644 --- a/src/nf/nf_locally_connected1d_layer.f90 +++ b/src/nf/nf_locally_connected1d_layer.f90 @@ -32,8 +32,10 @@ module nf_locally_connected1d_layer procedure :: forward procedure :: backward procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: init procedure :: set_params @@ -97,6 +99,12 @@ module function get_params(self) result(params) !! 
Parameters to get end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(locally_connected1d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) !! Return the gradients of this layer. !! The gradients are ordered as weights first, biases second. @@ -106,6 +114,12 @@ module function get_gradients(self) result(gradients) !! Gradients to get end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(locally_connected1d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + end subroutine get_gradients_ptr + module subroutine set_params(self, params) !! Set the parameters of the layer. class(locally_connected1d_layer), intent(in out) :: self diff --git a/src/nf/nf_locally_connected1d_layer_submodule.f90 b/src/nf/nf_locally_connected1d_layer_submodule.f90 index 053c520b..fa6110d5 100644 --- a/src/nf/nf_locally_connected1d_layer_submodule.f90 +++ b/src/nf/nf_locally_connected1d_layer_submodule.f90 @@ -128,12 +128,28 @@ module function get_params(self) result(params) params = [self % kernel, self % biases] end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(locally_connected1d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + w_ptr(1:size(self % kernel)) => self % kernel + b_ptr(1:size(self % biases)) => self % biases + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) class(locally_connected1d_layer), intent(in), target :: self real, allocatable :: gradients(:) gradients = [self % dw, self % db] end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(locally_connected1d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw + db_ptr(1:size(self % db)) => self % db + end subroutine get_gradients_ptr + module subroutine set_params(self, params) class(locally_connected1d_layer), intent(in out) :: self real, intent(in) :: params(:) diff --git a/src/nf/nf_network.f90 b/src/nf/nf_network.f90 index 2bd7ce8c..ac165adf 100644 --- a/src/nf/nf_network.f90 +++ b/src/nf/nf_network.f90 @@ -21,7 +21,6 @@ module nf_network contains procedure :: backward - procedure :: get_gradients procedure :: get_num_params procedure :: get_params procedure :: print_info @@ -216,7 +215,6 @@ module integer function get_num_params(self) !! Network instance end function get_num_params - module function get_params(self) result(params) !! Get the network parameters (weights and biases). class(network), intent(in) :: self @@ -225,13 +223,6 @@ module function get_params(self) result(params) !! Network parameters to get end function get_params - module function get_gradients(self) result(gradients) - class(network), intent(in) :: self - !! Network instance - real, allocatable :: gradients(:) - !! Network gradients to set - end function get_gradients - module subroutine set_params(self, params) !! Set the network parameters (weights and biases). 
class(network), intent(in out) :: self diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index d8f5ff50..d550f264 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -524,25 +524,6 @@ module function get_params(self) result(params) end function get_params - module function get_gradients(self) result(gradients) - class(network), intent(in) :: self - real, allocatable :: gradients(:) - integer :: n, nstart, nend - - allocate(gradients(self % get_num_params())) - - nstart = 1 - do n = 1, size(self % layers) - - if (self % layers(n) % get_num_params() < 1) cycle - - nend = nstart + self % layers(n) % get_num_params() - 1 - gradients(nstart:nend) = self % layers(n) % get_gradients() - nstart = nend + 1 - end do - - end function get_gradients - module subroutine set_params(self, params) class(network), intent(in out) :: self @@ -597,11 +578,23 @@ module subroutine train(self, input_data, output_data, batch_size, & ! If not provided, we default to SGD with its default settings. if (present(optimizer)) then self % optimizer = optimizer + + do n = 1, size(self % layers) + self % layers(n) % optimizer = optimizer + end do + else self % optimizer = sgd() + + do n = 1, size(self % layers) + self % layers(n) % optimizer = sgd() + end do + end if - call self % optimizer % init(self % get_num_params()) + do n = 1, size(self % layers) + call self % layers(n) % optimizer % init(self % layers(n) % get_num_params()) + end do ! Passing the loss instance is optional. ! If not provided, we default to quadratic(). @@ -649,6 +642,7 @@ module subroutine update(self, optimizer, batch_size) integer, intent(in), optional :: batch_size integer :: batch_size_ real, allocatable :: params(:) + real, pointer :: weights(:), biases(:), dw(:), db(:) integer :: n ! Passing the optimizer instance is optional. If not provided, and if the @@ -661,10 +655,24 @@ module subroutine update(self, optimizer, batch_size) if (.not. allocated(self % optimizer)) then if (present(optimizer)) then self % optimizer = optimizer + + do n = 1, size(self % layers) + self % layers(n) % optimizer = optimizer + end do + else self % optimizer = sgd() + + do n = 1, size(self % layers) + self % layers(n) % optimizer = sgd() + end do + end if - call self % optimizer % init(self % get_num_params()) + + do n = 1, size(self % layers) + call self % layers(n) % optimizer % init(self % layers(n) % get_num_params()) + end do + end if if (present(batch_size)) then @@ -693,25 +701,50 @@ module subroutine update(self, optimizer, batch_size) end do #endif - params = self % get_params() - call self % optimizer % minimize(params, self % get_gradients() / batch_size_) - call self % set_params(params) - - ! Flush network gradients to zero. 
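Note on optimizer ownership: train and update now copy the user-supplied optimizer into each layer's polymorphic optimizer component and call init with that layer's parameter count, so state arrays such as SGD's velocity are sized per layer rather than for the whole network. The following standalone sketch (simplified, hypothetical type names) illustrates that ownership pattern; it is not part of the patch.

module per_layer_optimizer_sketch
  implicit none

  type, abstract :: optimizer_base
    real :: learning_rate = 0.01
  end type optimizer_base

  type, extends(optimizer_base) :: sgd_like
    real, allocatable :: velocity(:)
  end type sgd_like

  type :: layer_sketch
    integer :: num_params = 0
    class(optimizer_base), allocatable :: optimizer
  end type layer_sketch

end module per_layer_optimizer_sketch

program per_layer_ownership
  use per_layer_optimizer_sketch
  implicit none
  type(layer_sketch) :: layers(3)
  integer :: n

  layers % num_params = [6, 20, 4]

  do n = 1, size(layers)
    ! Intrinsic assignment to the polymorphic allocatable component allocates
    ! a fresh copy with the dynamic type of the right-hand side, so each layer
    ! carries independent optimizer state.
    layers(n) % optimizer = sgd_like(learning_rate=0.001)

    ! Stand-in for "call self % layers(n) % optimizer % init(...)": size the
    ! state for this layer's parameters only.
    select type (opt => layers(n) % optimizer)
    type is (sgd_like)
      allocate(opt % velocity(layers(n) % num_params))
      opt % velocity = 0
    end select
  end do

  print *, allocated(layers(1) % optimizer), allocated(layers(2) % optimizer)
end program per_layer_ownership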
do n = 2, size(self % layers) select type(this_layer => self % layers(n) % p) type is(dense_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % layers(n) % optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 - type is(conv2d_layer) + type is(conv1d_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % layers(n) % optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 - type is(conv1d_layer) + type is(conv2d_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % layers(n) % optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 type is(locally_connected1d_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % layers(n) % optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) % optimizer % minimize(biases, db / batch_size_) + this_layer % dw = 0 + this_layer % db = 0 + type is(linear2d_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % layers(n) % optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 + type is(layernorm_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % layers(n) % optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) % optimizer % minimize(biases, db / batch_size_) + this_layer % d_gamma = 0 + this_layer % d_beta = 0 end select end do diff --git a/src/nf/nf_optimizers.f90 b/src/nf/nf_optimizers.f90 index c64cefed..9a6b1e1f 100644 --- a/src/nf/nf_optimizers.f90 +++ b/src/nf/nf_optimizers.f90 @@ -44,6 +44,7 @@ end subroutine minimize real :: momentum = 0 logical :: nesterov = .false. real, allocatable, private :: velocity(:) + integer, private :: start_index = 1 contains procedure :: init => init_sgd procedure :: minimize => minimize_sgd @@ -59,6 +60,7 @@ end subroutine minimize real :: decay_rate = 0.9 real :: epsilon = 1e-8 real, allocatable, private :: rms_gradient(:) + integer, private :: start_index = 1 contains procedure :: init => init_rmsprop procedure :: minimize => minimize_rmsprop @@ -82,6 +84,7 @@ end subroutine minimize real :: weight_decay_decoupled = 0 ! decoupled weight decay regularization (AdamW) real, allocatable, private :: m(:), v(:) integer, private :: t = 0 + integer, private :: start_index = 1 contains procedure :: init => init_adam procedure :: minimize => minimize_adam @@ -99,6 +102,7 @@ end subroutine minimize real :: learning_rate_decay = 0 real, allocatable, private :: sum_squared_gradient(:) integer, private :: t = 0 + integer, private :: start_index = 1 contains procedure :: init => init_adagrad procedure :: minimize => minimize_adagrad @@ -121,19 +125,38 @@ pure subroutine minimize_sgd(self, param, gradient) !! update rule. class(sgd), intent(inout) :: self real, intent(inout) :: param(:) - real, intent(in) :: gradient(:) + real, intent(in) :: gradient(:) ! 
Always the same size as param + integer :: end_index if (self % momentum > 0) then + + ! end_index is part of the bookkeeping for updating velocity because each + ! batch update makes two calls to minimize, one for the weights and one for + ! the biases. + ! We use start_index and end_index to update the appropriate sections + ! of the velocity array. + end_index = self % start_index + size(param) - 1 + ! Apply momentum update - self % velocity = self % momentum * self % velocity & + self % velocity(self % start_index:end_index) = & + self % momentum * self % velocity(self % start_index:end_index) & - self % learning_rate * gradient if (self % nesterov) then ! Apply Nesterov update - param = param + self % momentum * self % velocity & + param = param + self % momentum * self % velocity(self % start_index:end_index) & - self % learning_rate * gradient else - param = param + self % velocity + param = param + self % velocity(self % start_index:end_index) + end if + + if (end_index < size(param)) then + ! We updated the weights part, now we shift forward for the biases part + self % start_index = end_index + 1 + else + ! We updated the biases part, now we shift back to start for the next batch + self % start_index = 1 end if + else ! Apply regular update param = param - self % learning_rate * gradient @@ -157,14 +180,27 @@ pure subroutine minimize_rmsprop(self, param, gradient) class(rmsprop), intent(inout) :: self real, intent(inout) :: param(:) real, intent(in) :: gradient(:) + integer :: end_index + + end_index = self % start_index + size(param) - 1 ! Compute the RMS of the gradient using the RMSProp rule - self % rms_gradient = self % decay_rate * self % rms_gradient & + self % rms_gradient(self % start_index:end_index) = & + self % decay_rate * self % rms_gradient(self % start_index:end_index) & + (1 - self % decay_rate) * gradient**2 ! Update the network parameters based on the new RMS of the gradient param = param - self % learning_rate & - / sqrt(self % rms_gradient + self % epsilon) * gradient + / sqrt(self % rms_gradient(self % start_index:end_index) + self % epsilon) & + * gradient + + if (end_index < size(param)) then + ! We updated the weights part, now we shift forward for the biases part + self % start_index = end_index + 1 + else + ! We updated the biases part, now we shift back to start for the next batch + self % start_index = 1 + end if end subroutine minimize_rmsprop @@ -185,20 +221,27 @@ pure subroutine minimize_adam(self, param, gradient) class(adam), intent(inout) :: self real, intent(inout) :: param(:) real, intent(in) :: gradient(:) + integer :: end_index + + end_index = self % start_index + size(param) - 1 self % t = self % t + 1 ! If weight_decay_l2 > 0, use L2 regularization; ! otherwise, default to regular Adam. associate(g => gradient + self % weight_decay_l2 * param) - self % m = self % beta1 * self % m + (1 - self % beta1) * g - self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 + self % m(self % start_index:end_index) = & + self % beta1 * self % m(self % start_index:end_index) & + + (1 - self % beta1) * g + self % v(self % start_index:end_index) = & + self % beta2 * self % v(self % start_index:end_index) & + + (1 - self % beta2) * g**2 end associate ! Compute bias-corrected first and second moment estimates. 
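Note on the start_index bookkeeping: each optimizer's state array (velocity, rms_gradient, m and v, sum_squared_gradient) is still allocated for the layer's full parameter count, but minimize is now called twice per update, once for the flattened weights and once for the biases, so start_index/end_index select the slice of state that belongs to the current call, advancing after the weights and resetting after the biases. The following standalone sketch walks through one weights-then-biases cycle with made-up sizes; it is not part of the patch.

program start_index_sketch
  implicit none
  integer, parameter :: n_weights = 6, n_biases = 2
  real :: velocity(n_weights + n_biases)   ! state sized for the whole layer
  real :: grad_w(n_weights), grad_b(n_biases)
  integer :: start_index, end_index
  real, parameter :: momentum = 0.9, learning_rate = 0.01

  velocity = 0
  grad_w = 1.0
  grad_b = 2.0
  start_index = 1

  ! First minimize call of the batch: the weights slice of the state.
  end_index = start_index + size(grad_w) - 1
  velocity(start_index:end_index) = momentum * velocity(start_index:end_index) &
    - learning_rate * grad_w
  start_index = end_index + 1      ! shift forward for the biases call

  ! Second minimize call of the batch: the biases slice.
  end_index = start_index + size(grad_b) - 1
  velocity(start_index:end_index) = momentum * velocity(start_index:end_index) &
    - learning_rate * grad_b
  start_index = 1                  ! reset for the next batch

  print *, velocity
end program start_index_sketch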
associate( & - m_hat => self % m / (1 - self % beta1**self % t), & - v_hat => self % v / (1 - self % beta2**self % t) & + m_hat => self % m(self % start_index:end_index) / (1 - self % beta1**self % t), & + v_hat => self % v(self % start_index:end_index) / (1 - self % beta2**self % t) & ) ! Update parameters. @@ -208,6 +251,14 @@ pure subroutine minimize_adam(self, param, gradient) end associate + if (end_index < size(param)) then + ! We updated the weights part, now we shift forward for the biases part + self % start_index = end_index + 1 + else + ! We updated the biases part, now we shift back to start for the next batch + self % start_index = 1 + end if + end subroutine minimize_adam @@ -226,6 +277,9 @@ pure subroutine minimize_adagrad(self, param, gradient) class(adagrad), intent(inout) :: self real, intent(inout) :: param(:) real, intent(in) :: gradient(:) + integer :: end_index + + end_index = self % start_index + size(param) - 1 ! Update the current time step self % t = self % t + 1 @@ -239,13 +293,23 @@ pure subroutine minimize_adagrad(self, param, gradient) / (1 + (self % t - 1) * self % learning_rate_decay) & ) - self % sum_squared_gradient = self % sum_squared_gradient + g**2 + self % sum_squared_gradient(self % start_index:end_index) = & + self % sum_squared_gradient(self % start_index:end_index) + g**2 - param = param - learning_rate * g / (sqrt(self % sum_squared_gradient) & + param = param - learning_rate * g & + / (sqrt(self % sum_squared_gradient(self % start_index:end_index)) & + self % epsilon) end associate + if (end_index < size(param)) then + ! We updated the weights part, now we shift forward for the biases part + self % start_index = end_index + 1 + else + ! We updated the biases part, now we shift back to start for the next batch + self % start_index = 1 + end if + end subroutine minimize_adagrad -end module nf_optimizers +end module nf_optimizers \ No newline at end of file diff --git a/test/test_layernorm.f90 b/test/test_layernorm.f90 index 6a897575..9e8bfccf 100644 --- a/test/test_layernorm.f90 +++ b/test/test_layernorm.f90 @@ -27,14 +27,14 @@ program test_layernorm_instance end if contains - function allclose(x, y) result(res) - real, intent(in) :: x(:) - real, intent(in) :: y(:) - logical :: res - res = all(abs(x - y) <= (1e-06 + 1e-05 * abs(y))) + logical function allclose(x, y) result(res) + real, intent(in) :: x(:), y(:) + !res = all(abs(x - y) <= (1e-06 + 1e-05 * abs(y))) + res = all(abs(x - y) <= 1e-05) end function allclose + subroutine test_layernorm_forward(layernorm_instance, input, ok) type(layernorm_layer), intent(in out) :: layernorm_instance real, intent(in out) :: input(:, :) @@ -61,6 +61,7 @@ subroutine test_layernorm_forward(layernorm_instance, input, ok) end if end subroutine test_layernorm_forward + subroutine test_layernorm_backward(layernorm_instance, input, gradient, ok) type(layernorm_layer), intent(in out) :: layernorm_instance real, intent(in out) :: input(:, :) @@ -103,6 +104,7 @@ subroutine test_layernorm_backward(layernorm_instance, input, gradient, ok) end if end subroutine test_layernorm_backward + subroutine test_layernorm_gradients(input, gradient, ok) real, intent(in out) :: input(:, :) real, intent(in out) :: gradient(:, :) @@ -152,6 +154,7 @@ subroutine test_layernorm_gradients(input, gradient, ok) end if end subroutine test_layernorm_gradients + subroutine test_layernorm_integration(ok) logical, intent(in out) :: ok @@ -160,13 +163,13 @@ subroutine test_layernorm_integration(ok) real :: y(6) = [0.7, 0.2, 0.1, 0.1, 0.01, 0.9] 
real :: tolerance = 0.1 integer :: epoch - integer :: epochs = 10000 + integer, parameter :: num_epochs = 100000 - net = network([& - input(2, 3),& - linear2d(3),& - layernorm(),& - flatten()& + net = network([ & + input(2, 3), & + linear2d(3), & + layernorm(), & + flatten() & ]) ! Kaiming weights to achieve semblance of convergance @@ -177,17 +180,18 @@ subroutine test_layernorm_integration(ok) l % biases = 0.2 end select - do epoch = 1, epochs + do epoch = 1, num_epochs call net % forward(x) call net % backward(y) call net % update(optimizer=sgd(learning_rate=0.001)) if (all(abs(net % predict(x) - y) < tolerance)) exit end do - if (.not. epoch <= epochs) then + if (.not. epoch <= num_epochs) then write(stderr, '(a)') & 'linear2d + layernorm should converge in simple training.. failed' ok = .false. end if end subroutine test_layernorm_integration + end program test_layernorm_instance
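For reference, a minimal usage sketch of the training path exercised by this patch, mirroring the integration test above (layer sizes, epoch count, and hyperparameters are arbitrary): gradients now flow through each layer's own optimizer instance rather than through a single network-wide parameter and gradient vector.

program per_layer_optimizer_usage
  use nf, only: dense, input, network, sgd
  implicit none
  type(network) :: net
  real :: x(3), y(2)
  integer :: epoch

  net = network([input(3), dense(5), dense(2)])
  x = [0.1, 0.2, 0.3]
  y = [0.0, 1.0]

  do epoch = 1, 1000
    call net % forward(x)
    call net % backward(y)
    ! Each layer's optimizer (here SGD) updates that layer's weights and
    ! biases in place via get_params_ptr / get_gradients_ptr.
    call net % update(optimizer=sgd(learning_rate=0.01))
  end do

  print *, net % predict(x)
end program per_layer_optimizer_usage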