From 0f6b961bf9f2ba96d2f085136383304fa6531a7e Mon Sep 17 00:00:00 2001
From: Michael McLeod
Date: Mon, 23 Oct 2023 21:29:50 +0100
Subject: [PATCH 1/3] Initial gradient descent draft

---
 cpp/sopt/gradient_descent.h | 106 ++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 cpp/sopt/gradient_descent.h

diff --git a/cpp/sopt/gradient_descent.h b/cpp/sopt/gradient_descent.h
new file mode 100644
index 000000000..6ea8cfc0e
--- /dev/null
+++ b/cpp/sopt/gradient_descent.h
@@ -0,0 +1,106 @@
+#ifndef SOPT_GRADIENT_DESCENT_H
+#define SOPT_GRADIENT_DESCENT_H
+
+#include <functional>
+#include "sopt/linear_transform.h"
+#include "sopt/types.h"
+
+namespace sopt::algorithm {
+
+//! \brief Pure gradient descent algorithm
+//! \details Requires \f$\nabla f, \nabla g\f$ to be analytically defined.
+//! \f$x_{n+1} = x_n - \alpha\left(\Re(\nabla f(x_n, y)) + \lambda \nabla g(\mu x_n)\right)\f$
+//! \param f_gradient: Gradient function for f, where f is usually a likelihood. Takes two arguments (x, y).
+//! \param g_gradient: Gradient function for g, where g is usually a prior / regulariser. Takes one argument (x).
+//! \param lambda: multiplier for the g gradient function
+//! \param Lipschitz_f: Lipschitz constant of function f (used to calculate alpha)
+//! \param Lipschitz_g: Lipschitz constant of function g (used to calculate alpha)
+//! \param mu: Scaling parameter for the vector inside g. Also used to calculate alpha
+template <typename SCALAR>
+class GradientDescent
+{
+ public:
+  using F_Gradient =
+      typename std::function<Vector<SCALAR>(const Vector<SCALAR> &, const Vector<SCALAR> &)>;
+  using G_Gradient = typename std::function<Vector<SCALAR>(const Vector<SCALAR> &)>;
+  using REAL = typename real_type<SCALAR>::type;
+
+  template <typename DERIVED>
+  GradientDescent(F_Gradient const &f_gradient, G_Gradient const &g_gradient,
+                  Eigen::MatrixBase<DERIVED> const &target,
+                  LinearTransform<Vector<SCALAR>> const &measurement_operator, REAL mu, REAL lambda,
+                  REAL Lipschitz_f, REAL Lipschitz_g, REAL delta_converged)
+      : mu(mu),
+        Lipschitz_f(Lipschitz_f),
+        Lipschitz_g(Lipschitz_g),
+        lambda(lambda),
+        Phi(measurement_operator),
+        threshold_delta(delta_converged),
+        f_gradient(f_gradient),
+        g_gradient(g_gradient)
+  {
+    alpha = 0.98 / (Lipschitz_f + mu * lambda * Lipschitz_g);
+  }
+
+ protected:
+  LinearTransform<Vector<SCALAR>> Phi;
+  F_Gradient f_gradient;
+  G_Gradient g_gradient;
+  REAL alpha;
+  REAL lambda;
+  REAL mu;
+  REAL Lipschitz_f;
+  REAL Lipschitz_g;
+  Vector<SCALAR> target;
+  REAL threshold_delta;
+  Vector<SCALAR> delta_x;
+  REAL theta_now = 1;
+  REAL theta_next;
+  Vector<SCALAR> x_prev;
+  uint max_iterations;
+
+  void iteration_step(Vector<SCALAR> &x, Vector<SCALAR> &z)
+  {
+    // Should be able to make this better to avoid copies
+    x_prev = x;
+
+    delta_x = f_gradient(z, target).real();
+    delta_x += lambda * g_gradient(mu * z);
+    delta_x *= alpha;
+
+    theta_next = 0.5 * (1 + sqrt(1 + 4 * theta_now * theta_now));
+
+    x = z - delta_x;
+    z = x + (theta_now - 1) / theta_next * (x - x_prev);
+  }
+
+  bool converged(Vector<SCALAR> &x)
+  {
+    return (delta_x.norm() / x.norm()) < threshold_delta;
+  }
+
+  void operator()(Vector<SCALAR> &x, Vector<SCALAR> &residual)
+  {
+    Vector<SCALAR> z = x;
+    bool converged = false;
+    for (uint i = 0; i < max_iterations; i++)
+    {
+      iteration_step(x, z);
+
+      converged = converged(x);
+      if (converged) break;
+    }
+
+    if (converged)
+    {
+      // TODO: Log some relevant stuff about the convergence.
+    }
+
+    residual = (Phi * x) - target;
+  }
+
+};
+
+} // namespace sopt::algorithm

+#endif // SOPT_GRADIENT_DESCENT_H
\ No newline at end of file
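Note, not part of the patch series: iteration_step() in the draft above is the
Nesterov/FISTA-style accelerated scheme, with the momentum sequence
theta_{n+1} = (1 + sqrt(1 + 4 theta_n^2)) / 2 and the gradient evaluated at an
extrapolated point z rather than at x. The following is a minimal, self-contained
sketch of the same update rule, assuming only Eigen, on the toy objective
f(x, y) = 0.5 * ||x - y||^2, so that grad f(x, y) = x - y, the Lipschitz constant
is 1, and the prior term is dropped. Starting theta at 1 is the conventional choice,
which is also why theta_now is initialised to 1 above.

    #include <Eigen/Dense>
    #include <cmath>
    #include <iostream>

    int main() {
      using Vec = Eigen::VectorXd;
      const Vec y = Vec::Random(10);  // stand-in for the measurement target
      Vec x = Vec::Zero(10);          // current iterate
      Vec z = x;                      // extrapolated point where the gradient is taken
      Vec x_prev = x;
      const double alpha = 0.98;      // 0.98 / L_f, with L_f = 1 for this objective
      double theta = 1.0;
      for (int n = 0; n < 200; ++n) {
        x_prev = x;
        const Vec step = alpha * (z - y);  // alpha * grad f(z, y); no prior term here
        const double theta_next = 0.5 * (1.0 + std::sqrt(1.0 + 4.0 * theta * theta));
        x = z - step;                                         // gradient step
        z = x + ((theta - 1.0) / theta_next) * (x - x_prev);  // momentum extrapolation
        theta = theta_next;
        if (step.norm() / x.norm() < 1e-6) break;  // relative-step stopping rule, as in converged()
      }
      std::cout << "distance to target: " << (x - y).norm() << "\n";
    }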
From 7e9a2125869bac021714179d956c5ffd235b6532 Mon Sep 17 00:00:00 2001
From: Michael McLeod
Date: Wed, 25 Oct 2023 20:34:39 +0100
Subject: [PATCH 2/3] Add initial gradient descent with basic test

---
 cpp/sopt/CMakeLists.txt       |  1 +
 cpp/sopt/gradient_descent.h   | 95 ++++++++++++++++++++---------------
 cpp/tests/CMakeLists.txt      |  1 +
 cpp/tests/gradient_descent.cc | 79 +++++++++++++++++++++++++++++
 4 files changed, 136 insertions(+), 40 deletions(-)
 create mode 100644 cpp/tests/gradient_descent.cc

diff --git a/cpp/sopt/CMakeLists.txt b/cpp/sopt/CMakeLists.txt
index e782d6ea4..d2aa5980f 100644
--- a/cpp/sopt/CMakeLists.txt
+++ b/cpp/sopt/CMakeLists.txt
@@ -2,6 +2,7 @@ set(headers
   bisection_method.h chained_operators.h credible_region.h
   imaging_padmm.h logging.disabled.h
+  gradient_descent.h
   forward_backward.h imaging_forward_backward.h
   g_proximal.h l1_g_proximal.h joint_map.h
   imaging_primal_dual.h primal_dual.h
diff --git a/cpp/sopt/gradient_descent.h b/cpp/sopt/gradient_descent.h
index 6ea8cfc0e..bd6b38658 100644
--- a/cpp/sopt/gradient_descent.h
+++ b/cpp/sopt/gradient_descent.h
@@ -7,6 +7,18 @@
 
 namespace sopt::algorithm {
 
+//! Values indicating how the algorithm ran
+template <typename SCALAR>
+struct AlgorithmResults {
+  //! Number of iterations
+  t_uint niters;
+  //! Whether convergence was achieved
+  bool good;
+  //! The residual from the last iteration
+  Vector<SCALAR> residual;
+  Vector<SCALAR> result;
+};
+
 //! \brief Pure gradient descent algorithm
 //! \details Requires \f$\nabla f, \nabla g\f$ to be analytically defined.
 //! \f$x_{n+1} = x_n - \alpha\left(\Re(\nabla f(x_n, y)) + \lambda \nabla g(\mu x_n)\right)\f$
@@ -25,21 +37,44 @@ class GradientDescent
   using G_Gradient = typename std::function<Vector<SCALAR>(const Vector<SCALAR> &)>;
   using REAL = typename real_type<SCALAR>::type;
 
-  template <typename DERIVED>
-  GradientDescent(F_Gradient const &f_gradient, G_Gradient const &g_gradient,
-                  Eigen::MatrixBase<DERIVED> const &target,
-                  LinearTransform<Vector<SCALAR>> const &measurement_operator, REAL mu, REAL lambda,
-                  REAL Lipschitz_f, REAL Lipschitz_g, REAL delta_converged)
-      : mu(mu),
-        Lipschitz_f(Lipschitz_f),
-        Lipschitz_g(Lipschitz_g),
-        lambda(lambda),
-        Phi(measurement_operator),
-        threshold_delta(delta_converged),
+  GradientDescent(F_Gradient const &f_gradient,
+                  G_Gradient const &g_gradient,
+                  Vector<SCALAR> const &target)
+      : Phi(linear_transform_identity<SCALAR>()),
         f_gradient(f_gradient),
-        g_gradient(g_gradient)
+        g_gradient(g_gradient),
+        target(target)
   {
-    alpha = 0.98 / (Lipschitz_f + mu * lambda * Lipschitz_g);
+    //alpha = 0.98 / (Lipschitz_f + mu * lambda * Lipschitz_g);
+    alpha = 0.1;
+  }
+
+  AlgorithmResults<SCALAR> operator()(Vector<SCALAR> &x)
+  {
+    Vector<SCALAR> z = x;
+    bool converged = false;
+    uint iterations = 0;
+    while ((!converged) && (iterations < max_iterations))
+    {
+      iteration_step(x, z);
+
+      converged = is_converged(x);
+
+      ++iterations;
+    }
+
+    if (converged)
+    {
+      // TODO: Log some relevant stuff about the convergence.
+    }
+
+    AlgorithmResults<SCALAR> results;
+    results.good = converged;
+    results.niters = iterations;
+    results.residual = (Phi * x) - target;
+    results.result = z;
+
+    return results;
   }
 
  protected:
@@ -47,17 +82,17 @@ class GradientDescent
   F_Gradient f_gradient;
   G_Gradient g_gradient;
   REAL alpha;
-  REAL lambda;
-  REAL mu;
-  REAL Lipschitz_f;
-  REAL Lipschitz_g;
+  REAL lambda = 1;
+  REAL mu = 1;
+  REAL Lipschitz_f = 1;
+  REAL Lipschitz_g = 1;
   Vector<SCALAR> target;
-  REAL threshold_delta;
+  REAL threshold_delta = 1e-4;
   Vector<SCALAR> delta_x;
   REAL theta_now = 1;
   REAL theta_next;
   Vector<SCALAR> x_prev;
-  uint max_iterations;
+  uint max_iterations = 200;
 
   void iteration_step(Vector<SCALAR> &x, Vector<SCALAR> &z)
   {
@@ -74,31 +109,11 @@ class GradientDescent
     z = x + (theta_now - 1) / theta_next * (x - x_prev);
   }
 
-  bool converged(Vector<SCALAR> &x)
+  bool is_converged(Vector<SCALAR> &x)
   {
     return (delta_x.norm() / x.norm()) < threshold_delta;
   }
 
-  void operator()(Vector<SCALAR> &x, Vector<SCALAR> &residual)
-  {
-    Vector<SCALAR> z = x;
-    bool converged = false;
-    for (uint i = 0; i < max_iterations; i++)
-    {
-      iteration_step(x, z);
-
-      converged = converged(x);
-      if (converged) break;
-    }
-
-    if (converged)
-    {
-      // TODO: Log some relevant stuff about the convergence.
-    }
-
-    residual = (Phi * x) - target;
-  }
-
 };
 
 } // namespace sopt::algorithm
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 61d06db59..d7c84ce69 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -24,6 +24,7 @@ add_catch_test(chained_operators LIBRARIES sopt SEED ${RAND_SEED})
 add_catch_test(conjugate_gradient LIBRARIES sopt SEED ${RAND_SEED})
 add_catch_test(credible_region LIBRARIES sopt SEED ${RAND_SEED})
 add_catch_test(forward_backward LIBRARIES sopt tools_for_tests SEED ${RAND_SEED})
+add_catch_test(gradient_descent LIBRARIES sopt tools_for_tests SEED ${RAND_SEED})
 add_catch_test(gradient_operator LIBRARIES sopt tools_for_tests SEED ${RAND_SEED})
 add_catch_test(inpainting LIBRARIES sopt tools_for_tests SEED ${RAND_SEED})
 add_catch_test(linear_transform LIBRARIES sopt SEED ${RAND_SEED})
diff --git a/cpp/tests/gradient_descent.cc b/cpp/tests/gradient_descent.cc
new file mode 100644
index 000000000..22f980e73
--- /dev/null
+++ b/cpp/tests/gradient_descent.cc
@@ -0,0 +1,79 @@
+#include <catch2/catch_all.hpp>
+#include "sopt/gradient_descent.h"
+#include <random>
+
+uint constexpr N = 10;
+
+TEST_CASE("Gradient Descent with flat prior", "[GradDescent]")
+{
+  using namespace sopt;
+
+  const Vector<float> target = Vector<float>::Random(N);
+  float const sigma = 0.5;
+  float const gamma = 0.1;
+  uint const max_iterations = 100;
+
+  auto const grad_likelihood = [](const Vector<float> &x, const Vector<float> &y){return (x-y);};
+  auto const grad_prior = [](const Vector<float> &x){return 0*x;};
+
+  Vector<float> init_guess = Vector<float>::Random(N);
+
+  auto Phi = linear_transform_identity<float>();
+
+  algorithm::GradientDescent<float> gd(grad_likelihood, grad_prior, target);
+
+  algorithm::AlgorithmResults<float> results = gd(init_guess);
+
+  CHECK(results.good);
+  CHECK(results.result.isApprox(target, 0.1));
+}
+
+TEST_CASE("Gradient Descent with smoothness prior", "[GradDescent]")
+{
+  using namespace sopt;
+  std::mt19937_64 rng;
+  std::uniform_real_distribution<float> noise(0, 0.2);
+
+  Vector<float> data(N);
+  for(uint i = 0; i < N; i++)
+  {
+    data(i) = sin((M_PI/(N-1))*i) + noise(rng);
+  }
+
+  Vector<float> perfect(N);
+  for(uint i = 0; i < N; i++)
+  {
+    perfect(i) = sin((M_PI/(N-1))*i);
+  }
+
+  float const sigma = 0.5;
+  float const gamma = 0.1;
+  uint const max_iterations = 100;
+
+  auto const grad_likelihood = [](const Vector<float> &x, const Vector<float> &y){return (x-y);};
+  auto const grad_prior = [](const Vector<float> &x)
+  {
+    Vector<float> grad(x.size());
+    grad(0) = x(0);
+    grad(x.size()-1) = x(x.size()-1);
+    for(uint i = 1; i < x.size()-1; i++)
+    {
+      // Push values to be roughly in line with neighbours
+      // Hand wavey kind of smoothness prior
+      grad(i) = x(i) - 0.5*(x(i-1) + x(i+1));
+    }
+    return grad;
+  };
+
+  Vector<float> init_guess = Vector<float>::Random(N);
+
+  auto Phi = linear_transform_identity<float>();
+
+  algorithm::GradientDescent<float> gd(grad_likelihood, grad_prior, data);
+
+  algorithm::AlgorithmResults<float> results = gd(init_guess);
+
+  CHECK(results.good);
+  CHECK(results.result.isApprox(perfect, 0.1));
+  CHECK((results.result - perfect).squaredNorm() < (data-perfect).squaredNorm());
+}
\ No newline at end of file
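Note, not part of the patch series: after this patch the solver is driven through
operator(), which advances the initial guess in place and reports the outcome in an
AlgorithmResults<SCALAR>. A usage sketch under the same assumptions as the tests
(float scalars, the identity measurement operator the new constructor installs, and
illustrative 10-element vectors):

    #include "sopt/gradient_descent.h"
    #include <iostream>

    int main() {
      using namespace sopt;
      const Vector<float> target = Vector<float>::Random(10);

      // Quadratic data fit and a flat (zero-gradient) prior, as in the first test case.
      auto const grad_likelihood = [](const Vector<float> &x, const Vector<float> &y) {
        return Vector<float>(x - y);
      };
      auto const grad_prior = [](const Vector<float> &x) { return Vector<float>(0 * x); };

      Vector<float> guess = Vector<float>::Random(10);
      algorithm::GradientDescent<float> gd(grad_likelihood, grad_prior, target);
      algorithm::AlgorithmResults<float> results = gd(guess);

      std::cout << (results.good ? "converged" : "stopped at max_iterations")
                << " after " << results.niters << " iterations, residual norm "
                << results.residual.norm() << "\n";
    }

One design point worth noting: results.result is filled from the extrapolated iterate
z rather than from x; near convergence the two coincide to within the step size, which
is what the loose tolerances in the tests rely on.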
From 4081422cc6d9be0201324fa5b88c4acbee1ba064 Mon Sep 17 00:00:00 2001
From: Michael McLeod
Date: Wed, 25 Oct 2023 20:44:48 +0100
Subject: [PATCH 3/3] Add additional params into constructor

---
 cpp/sopt/gradient_descent.h   | 19 ++++++++++++++-----
 cpp/tests/gradient_descent.cc |  4 ++--
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/cpp/sopt/gradient_descent.h b/cpp/sopt/gradient_descent.h
index bd6b38658..1f382773d 100644
--- a/cpp/sopt/gradient_descent.h
+++ b/cpp/sopt/gradient_descent.h
@@ -39,14 +39,23 @@ class GradientDescent
 
   GradientDescent(F_Gradient const &f_gradient,
                   G_Gradient const &g_gradient,
-                  Vector<SCALAR> const &target)
+                  Vector<SCALAR> const &target,
+                  REAL const threshold,
+                  REAL const Lipschitz_f = 1,
+                  REAL const Lipschitz_g = 1,
+                  REAL const mu = 1,
+                  REAL const lambda = 1)
       : Phi(linear_transform_identity<SCALAR>()),
         f_gradient(f_gradient),
        g_gradient(g_gradient),
-        target(target)
+        target(target),
+        Lipschitz_f(Lipschitz_f),
+        Lipschitz_g(Lipschitz_g),
+        mu(mu),
+        lambda(lambda),
+        threshold_delta(threshold)
   {
-    //alpha = 0.98 / (Lipschitz_f + mu * lambda * Lipschitz_g);
-    alpha = 0.1;
+    alpha = 0.98 / (Lipschitz_f + mu * lambda * Lipschitz_g);
   }
 
   AlgorithmResults<SCALAR> operator()(Vector<SCALAR> &x)
@@ -87,7 +96,7 @@ class GradientDescent
   REAL Lipschitz_f = 1;
   REAL Lipschitz_g = 1;
   Vector<SCALAR> target;
-  REAL threshold_delta = 1e-4;
+  REAL threshold_delta;
   Vector<SCALAR> delta_x;
   REAL theta_now = 1;
   REAL theta_next;
diff --git a/cpp/tests/gradient_descent.cc b/cpp/tests/gradient_descent.cc
index 22f980e73..7b47a3004 100644
--- a/cpp/tests/gradient_descent.cc
+++ b/cpp/tests/gradient_descent.cc
@@ -20,7 +20,7 @@ TEST_CASE("Gradient Descent with flat prior", "[GradDescent]")
 
   auto Phi = linear_transform_identity<float>();
 
-  algorithm::GradientDescent<float> gd(grad_likelihood, grad_prior, target);
+  algorithm::GradientDescent<float> gd(grad_likelihood, grad_prior, target, 1e-4);
 
   algorithm::AlgorithmResults<float> results = gd(init_guess);
 
@@ -69,7 +69,7 @@ TEST_CASE("Gradient Descent with smoothness prior", "[GradDescent]")
 
   auto Phi = linear_transform_identity<float>();
 
-  algorithm::GradientDescent<float> gd(grad_likelihood, grad_prior, data);
+  algorithm::GradientDescent<float> gd(grad_likelihood, grad_prior, data, 1e-4);
 
   algorithm::AlgorithmResults<float> results = gd(init_guess);
 
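Note, not part of the patch series: the step size restored in this patch follows the
standard smoothness bound. If grad f is Lipschitz with constant L_f and grad g with
constant L_g, the combined direction grad f(x, y) + lambda * grad g(mu * x) is
Lipschitz in x with constant at most L_f + mu * lambda * L_g (the chain rule
contributes the factor mu, the multiplier contributes lambda), and a fixed step
alpha <= 1 / L keeps the accelerated iteration stable; the factor 0.98 leaves a
safety margin below that bound. A worked sketch with hypothetical constants:

    #include <iostream>

    int main() {
      // Hypothetical values, for illustration only.
      const double Lipschitz_f = 4.0;  // e.g. ||Phi||^2 for a quadratic data term
      const double Lipschitz_g = 1.0;
      const double mu = 2.0;
      const double lambda = 0.5;

      // L bounds the Lipschitz constant of the combined gradient.
      const double L = Lipschitz_f + mu * lambda * Lipschitz_g;  // 4 + 2*0.5*1 = 5
      const double alpha = 0.98 / L;                             // 0.196
      std::cout << "alpha = " << alpha << "\n";
    }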
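Note, not part of the patch series: the hand-rolled smoothness prior in the tests is
linear, so its gradient is multiplication by a tridiagonal matrix: identity rows at
the two endpoints, and 1 on the diagonal with -0.5 on both off-diagonals in the
interior. A small sketch (assuming Eigen, with illustrative names) making that
explicit:

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      const int n = 10;
      Eigen::MatrixXf D = Eigen::MatrixXf::Zero(n, n);
      D(0, 0) = 1.0f;          // grad(0) = x(0)
      D(n - 1, n - 1) = 1.0f;  // grad(n-1) = x(n-1)
      for (int i = 1; i < n - 1; ++i) {
        D(i, i) = 1.0f;        // grad(i) = x(i) - 0.5 * (x(i-1) + x(i+1))
        D(i, i - 1) = -0.5f;
        D(i, i + 1) = -0.5f;
      }

      // The loop in the test computes exactly D * x.
      const Eigen::VectorXf x = Eigen::VectorXf::Random(n);
      Eigen::VectorXf grad(n);
      grad(0) = x(0);
      grad(n - 1) = x(n - 1);
      for (int i = 1; i < n - 1; ++i) grad(i) = x(i) - 0.5f * (x(i - 1) + x(i + 1));

      std::cout << "max difference: " << (D * x - grad).cwiseAbs().maxCoeff() << "\n";  // ~0
    }

Since D is symmetric and each row's absolute values sum to at most 2, Gershgorin's
theorem bounds its spectral norm by 2, so Lipschitz_g <= 2 would be a safe value to
pass for this prior through the constructor added in PATCH 3/3.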