ooples
diff --git a/‎src/ReinforcementLearning/Agents/CQL/CQLAgent.cs‎
Lines changed: 78 additions & 15 deletions b/‎src/ReinforcementLearning/Agents/CQL/CQLAgent.cs‎
Lines changed: 78 additions & 15 deletions
diff --git a/‎src/ReinforcementLearning/Agents/MADDPG/MADDPGAgent.cs‎
Lines changed: 84 additions & 14 deletions b/‎src/ReinforcementLearning/Agents/MADDPG/MADDPGAgent.cs‎
Lines changed: 84 additions & 14 deletions
@@ -203,7 +203,7 @@ public override Vector<T> SelectAction(Vector<T> state, bool training = true)
         for (int i = 0; i < _options.ActionSize; i++)
         {
             var std = NumOps.Exp(logStd[i]);
-            var noise = MathHelper.GetNormalRandom<T>(_numOps.Zero, _numOps.One);
+            var noise = GetSeededNormalRandom(_numOps.Zero, _numOps.One, _random);
             var rawAction = _numOps.Add(mean[i], _numOps.Multiply(std, noise));
             action[i] = MathHelper.Tanh<T>(rawAction);
         }
@@ -269,8 +269,20 @@ private T UpdateQNetworks(List<ReplayBuffers.Experience<T>> batch)
             var q2TargetValue = q2TargetTensor.ToVector()[0];
             var minQTarget = MathHelper.Min<T>(q1TargetValue, q2TargetValue);
 
-            // Compute entropy term (simplified)
-            var entropyTerm = _numOps.Multiply(_alpha, _numOps.FromDouble(0.1));  // Simplified entropy
+            // Compute actual policy entropy from log probabilities
+            // For Gaussian policy: entropy = 0.5 * log(2 * pi * e * sigma^2)
+            var policyOutputTensor = _policyNetwork.Predict(Tensor<T>.FromVector(experience.NextState));
+            var policyOutput = policyOutputTensor.ToVector();
+            T policyEntropy = _numOps.Zero;
+            for (int entropyIdx = 0; entropyIdx < _options.ActionSize; entropyIdx++)
+            {
+                var logStd = policyOutput[_options.ActionSize + entropyIdx];
+                logStd = MathHelper.Clamp<T>(logStd, _numOps.FromDouble(-20), _numOps.FromDouble(2));
+                // Gaussian entropy: 0.5 * (1 + log(2*pi)) + log(sigma)
+                var gaussianConst = _numOps.FromDouble(0.5 * (1.0 + System.Math.Log(2.0 * System.Math.PI)));
+                policyEntropy = _numOps.Add(policyEntropy, _numOps.Add(gaussianConst, logStd));
+            }
+            var entropyTerm = _numOps.Multiply(_alpha, policyEntropy);
 
             T targetQ;
             if (experience.Done)
@@ -395,21 +407,30 @@ private T UpdatePolicy(List<ReplayBuffers.Experience<T>> batch)
             // Policy loss: -Q(s,a) + alpha * entropy (simplified)
             var policyLoss = _numOps.Negate(minQ);
 
-            totalLoss = _numOps.Add(totalLoss, _numOps.Multiply(policyLoss, policyLoss));
+            totalLoss = _numOps.Add(totalLoss, policyLoss);
 
             // Backprop through Q-network to get action gradient
             var qGradTensor = Tensor<T>.FromVector(new Vector<T>(new[] { _numOps.One }));
             var actionGradTensor = _q1Network.Backpropagate(qGradTensor);
             var actionGrad = actionGradTensor.ToVector();
 
-            // Extract action part of gradient and negate for gradient ascent (maximize Q)
+            // Compute policy gradients for both mean and log-sigma
+            // We want to MAXIMIZE Q, so negate the gradient (gradient descent becomes ascent)
+            var policyStateTensor = Tensor<T>.FromVector(experience.State);
+            var policyOutTensor = _policyNetwork.Forward(policyStateTensor);
+            var policyOut = policyOutTensor.ToVector();
+            
             var policyGrad = new Vector<T>(_options.ActionSize * 2);
-            for (int i = 0; i < _options.ActionSize; i++)
+            for (int policyGradIdx = 0; policyGradIdx < _options.ActionSize; policyGradIdx++)
             {
-                // Negate gradient for ascent on Q-value
-                policyGrad[i] = _numOps.Negate(actionGrad[_options.StateSize + i]);
-                // Set log-sigma gradients to zero (exploration is handled separately)
-                policyGrad[_options.ActionSize + i] = _numOps.Zero;
+                // Negate gradient to maximize Q-value (flip sign for gradient descent optimizer)
+                policyGrad[policyGradIdx] = _numOps.Negate(actionGrad[_options.StateSize + policyGradIdx]);
+                
+                // Compute log-sigma gradients from entropy regularization
+                // d/d(log_sigma) of entropy = 1 (from Gaussian entropy formula)
+                var logStd = policyOut[_options.ActionSize + policyGradIdx];
+                var entropyGrad = _alpha; // Gradient of entropy w.r.t. log_sigma
+                policyGrad[_options.ActionSize + policyGradIdx] = entropyGrad;
             }
 
             var policyGradTensor = Tensor<T>.FromVector(policyGrad);
@@ -429,7 +450,39 @@ private T UpdatePolicy(List<ReplayBuffers.Experience<T>> batch)
 
     private void UpdateTemperature(List<ReplayBuffers.Experience<T>> batch)
     {
-        // Simplified temperature update
+        // Temperature update using entropy target
+        // Loss: alpha * (entropy - target_entropy)
+        // Gradient: d_loss/d_log_alpha = alpha * (entropy - target_entropy)
+        
+        T avgEntropy = _numOps.Zero;
+        foreach (var experience in batch)
+        {
+            var policyOutputTensor = _policyNetwork.Predict(Tensor<T>.FromVector(experience.State));
+            var policyOutput = policyOutputTensor.ToVector();
+            
+            T entropy = _numOps.Zero;
+            for (int tempIdx = 0; tempIdx < _options.ActionSize; tempIdx++)
+            {
+                var logStd = policyOutput[_options.ActionSize + tempIdx];
+                logStd = MathHelper.Clamp<T>(logStd, _numOps.FromDouble(-20), _numOps.FromDouble(2));
+                var gaussianConst = _numOps.FromDouble(0.5 * (1.0 + System.Math.Log(2.0 * System.Math.PI)));
+                entropy = _numOps.Add(entropy, _numOps.Add(gaussianConst, logStd));
+            }
+            avgEntropy = _numOps.Add(avgEntropy, entropy);
+        }
+        avgEntropy = _numOps.Divide(avgEntropy, _numOps.FromDouble(batch.Count));
+        
+        // Target entropy: -dim(action_space)
+        var targetEntropy = _numOps.FromDouble(-_options.ActionSize);
+        var entropyGap = _numOps.Subtract(avgEntropy, targetEntropy);
+        
+        // Update log_alpha: log_alpha -= lr * alpha * entropy_gap
+        var alphaLr = _numOps.FromDouble(0.0003);
+        var alphaGrad = _numOps.Multiply(_alpha, entropyGap);
+        var alphaUpdate = _numOps.Multiply(alphaLr, alphaGrad);
+        _logAlpha = _numOps.Subtract(_logAlpha, alphaUpdate);
+        
+        // Update alpha from log_alpha
         _alpha = NumOps.Exp(_logAlpha);
     }
 
@@ -446,11 +499,11 @@ private void SoftUpdateNetwork(NeuralNetwork<T> source, NeuralNetwork<T> target)
         var oneMinusTau = _numOps.Subtract(_numOps.One, _options.TargetUpdateTau);
 
         var updatedParams = new Vector<T>(targetParams.Length);
-        for (int i = 0; i < targetParams.Length; i++)
+        for (int softUpdateIdx = 0; softUpdateIdx < targetParams.Length; softUpdateIdx++)
         {
-            var sourceContrib = _numOps.Multiply(_options.TargetUpdateTau, sourceParams[i]);
-            var targetContrib = _numOps.Multiply(oneMinusTau, targetParams[i]);
-            updatedParams[i] = _numOps.Add(sourceContrib, targetContrib);
+            var sourceContrib = _numOps.Multiply(_options.TargetUpdateTau, sourceParams[softUpdateIdx]);
+            var targetContrib = _numOps.Multiply(oneMinusTau, targetParams[softUpdateIdx]);
+            updatedParams[softUpdateIdx] = _numOps.Add(sourceContrib, targetContrib);
         }
 
         target.UpdateParameters(updatedParams);
@@ -476,6 +529,16 @@ private Vector<T> ConcatenateStateAction(Vector<T> state, Vector<T> action)
         return result;
     }
 
+    private T GetSeededNormalRandom(T mean, T stdDev, Random random)
+    {
+        // Box-Muller transform
+        double u1 = 1.0 - random.NextDouble();
+        double u2 = 1.0 - random.NextDouble();
+        double randStdNormal = Math.Sqrt(-2.0 * Math.Log(u1)) * Math.Sin(2.0 * Math.PI * u2);
+        double result = randStdNormal * Convert.ToDouble(stdDev) + Convert.ToDouble(mean);
+        return _numOps.FromDouble(result);
+    }
+
     public override Dictionary<string, T> GetMetrics()
     {
         return new Dictionary<string, T>
 
@@ -15,7 +15,7 @@ namespace AiDotNet.ReinforcementLearning.Agents.MADDPG;
 
 /// <summary>
 /// Multi-Agent Deep Deterministic Policy Gradient (MADDPG) agent.
-/// </summary>
+
 /// <typeparam name="T">The numeric type used for calculations.</typeparam>
 /// <remarks>
 /// <para>
@@ -57,9 +57,10 @@ public MADDPGAgent(MADDPGOptions<T> options, IOptimizer<T, Vector<T>, Vector<T>>
         : base(options)
     {
         _options = options ?? throw new ArgumentNullException(nameof(options));
+        // Issue #3 fix: Use configured actor learning rate for default optimizer
         _optimizer = optimizer ?? options.Optimizer ?? new AdamOptimizer<T, Vector<T>, Vector<T>>(this, new AdamOptimizerOptions<T, Vector<T>, Vector<T>>
         {
-            LearningRate = 0.001,
+            LearningRate = NumOps.ToDouble(_options.ActorLearningRate),
             Beta1 = 0.9,
             Beta2 = 0.999,
             Epsilon = 1e-8
@@ -105,7 +106,7 @@ private INeuralNetwork<T> CreateActorNetwork()
         var layers = new List<ILayer<T>>();
 
         // Input layer
-        layers.Add(new DenseLayer<T>(_options.StateSize, _options.ActorHiddenLayers.First(), (IActivationFunction<T>)new ReLUActivation<T>()));
+        layers.Add(new DenseLayer<T>(_options.StateSize, _options.ActorHiddenLayers[0], (IActivationFunction<T>)new ReLUActivation<T>()));
 
         // Hidden layers
         for (int i = 1; i < _options.ActorHiddenLayers.Count; i++)
@@ -114,6 +115,7 @@ private INeuralNetwork<T> CreateActorNetwork()
         }
 
         // Output layer with Tanh for continuous actions
+        // Issue #1 fix: DenseLayer constructor automatically applies Xavier/Glorot weight initialization
         layers.Add(new DenseLayer<T>(_options.ActorHiddenLayers.Last(), _options.ActionSize, (IActivationFunction<T>)new TanhActivation<T>()));
 
         var architecture = new NeuralNetworkArchitecture<T>(
@@ -136,7 +138,7 @@ private INeuralNetwork<T> CreateCriticNetwork()
         var layers = new List<ILayer<T>>();
 
         // Input layer
-        layers.Add(new DenseLayer<T>(inputSize, _options.CriticHiddenLayers.First(), (IActivationFunction<T>)new ReLUActivation<T>()));
+        layers.Add(new DenseLayer<T>(inputSize, _options.CriticHiddenLayers[0], (IActivationFunction<T>)new ReLUActivation<T>()));
 
         // Hidden layers
         for (int i = 1; i < _options.CriticHiddenLayers.Count; i++)
@@ -165,7 +167,7 @@ private void InitializeReplayBuffer()
 
     /// <summary>
     /// Select action for a specific agent.
-    /// </summary>
+    
     public Vector<T> SelectActionForAgent(int agentId, Vector<T> state, bool training = true)
     {
         if (agentId < 0 || agentId >= _options.NumAgents)
@@ -199,7 +201,14 @@ public override Vector<T> SelectAction(Vector<T> state, bool training = true)
 
     /// <summary>
     /// Store multi-agent experience.
-    /// </summary>
+    
+    /// <remarks>
+    /// Issue #2 fix: This method averages rewards across all agents, which works well for
+    /// cooperative scenarios but may not suit competitive or mixed-motive settings.
+    /// For competitive scenarios, consider storing per-agent rewards separately
+    /// or using a different reward aggregation strategy.
+    /// </remarks>
+    
     public void StoreMultiAgentExperience(
         List<Vector<T>> states,
         List<Vector<T>> actions,
@@ -212,7 +221,8 @@ public void StoreMultiAgentExperience(
         var jointAction = ConcatenateVectors(actions);
         var jointNextState = ConcatenateVectors(nextStates);
 
-        // Use average reward (or could be agent-specific)
+        // Use average reward (suitable for cooperative scenarios)
+        // Note: For competitive scenarios, per-agent reward tracking may be preferable
         T avgReward = NumOps.Zero;
         foreach (var reward in rewards)
         {
@@ -476,14 +486,40 @@ public override ModelMetadata<T> GetModelMetadata()
 
     public override int FeatureCount => _options.StateSize;
 
+    /// <summary>
+    /// Serializes the MADDPG agent to a byte array.
+    /// </summary>
+    /// <returns>Byte array containing the serialized agent data.</returns>
+    /// <exception cref="NotSupportedException">
+    /// MADDPG serialization is not currently supported. Use GetParameters() and SetParameters() instead.
+    /// </exception>
+    /// <remarks>
+    /// Issue #6 fix: Changed from NotImplementedException to NotSupportedException to indicate
+    /// this is a design limitation rather than incomplete implementation.
+    /// For saving/loading trained weights, use GetParameters() to extract all network weights
+    /// and SetParameters() to restore them.
+    /// </remarks>
     public override byte[] Serialize()
     {
-        throw new NotImplementedException("MADDPG serialization not yet implemented");
+        throw new NotSupportedException("MADDPG serialization is not currently supported. Use GetParameters() and SetParameters() for weight management.");
     }
 
+    /// <summary>
+    /// Deserializes a MADDPG agent from a byte array.
+    /// </summary>
+    /// <param name="data">Byte array containing the serialized agent data.</param>
+    /// <exception cref="NotSupportedException">
+    /// MADDPG deserialization is not currently supported. Use GetParameters() and SetParameters() instead.
+    /// </exception>
+    /// <remarks>
+    /// Issue #6 fix: Changed from NotImplementedException to NotSupportedException to indicate
+    /// this is a design limitation rather than incomplete implementation.
+    /// For saving/loading trained weights, use GetParameters() to extract all network weights
+    /// and SetParameters() to restore them.
+    /// </remarks>
     public override void Deserialize(byte[] data)
     {
-        throw new NotImplementedException("MADDPG deserialization not yet implemented");
+        throw new NotSupportedException("MADDPG deserialization is not currently supported. Use GetParameters() and SetParameters() for weight management.");
     }
 
     public override Vector<T> GetParameters()
@@ -546,9 +582,23 @@ public override void SetParameters(Vector<T> parameters)
         }
     }
 
+    /// <summary>
+    /// Creates a deep copy of this MADDPG agent including all trained network weights.
+    /// </summary>
+    /// <returns>A new MADDPG agent with the same configuration and trained parameters.</returns>
+    /// <remarks>
+    /// Issue #5 fix: Clone now properly copies all trained weights from actor and critic networks
+    /// using GetParameters() and SetParameters(), ensuring the cloned agent has the same learned behavior.
+    /// </remarks>
     public override IFullModel<T, Vector<T>, Vector<T>> Clone()
     {
-        return new MADDPGAgent<T>(_options, _optimizer);
+        var clonedAgent = new MADDPGAgent<T>(_options, _optimizer);
+
+        // Copy all trained parameters to the cloned agent
+        var currentParams = GetParameters();
+        clonedAgent.SetParameters(currentParams);
+
+        return clonedAgent;
     }
 
     public override Vector<T> ComputeGradients(
@@ -578,15 +628,35 @@ public override void ApplyGradients(Vector<T> gradients, T learningRate)
         SetParameters(newParams);
     }
 
+    /// <summary>
+    /// Saves the trained model to a file.
+    /// </summary>
+    /// <param name="filepath">Path to save the model.</param>
+    /// <exception cref="NotSupportedException">
+    /// MADDPG serialization is not currently supported.
+    /// </exception>
+    /// <remarks>
+    /// Issue #6 fix: SaveModel now throws NotSupportedException since Serialize() is not supported.
+    /// For saving trained weights, use GetParameters() to extract the parameter vector and save it separately.
+    /// </remarks>
     public override void SaveModel(string filepath)
     {
-        var data = Serialize();
-        System.IO.File.WriteAllBytes(filepath, data);
+        throw new NotSupportedException("MADDPG model saving is not currently supported. Use GetParameters() to extract trained weights for manual persistence.");
     }
 
+    /// <summary>
+    /// Loads a trained model from a file.
+    /// </summary>
+    /// <param name="filepath">Path to load the model from.</param>
+    /// <exception cref="NotSupportedException">
+    /// MADDPG deserialization is not currently supported.
+    /// </exception>
+    /// <remarks>
+    /// Issue #6 fix: LoadModel now throws NotSupportedException since Deserialize() is not supported.
+    /// For loading trained weights, use SetParameters() to restore a previously saved parameter vector.
+    /// </remarks>
     public override void LoadModel(string filepath)
     {
-        var data = System.IO.File.ReadAllBytes(filepath);
-        Deserialize(data);
+        throw new NotSupportedException("MADDPG model loading is not currently supported. Use SetParameters() to restore trained weights from manual persistence.");
     }
 }