16 changes: 5 additions & 11 deletions README.md
@@ -22,17 +22,7 @@ Watch the presentation video of DiCo.
git clone -b het_control https://github.com/proroklab/VectorizedMultiAgentSimulator.git
pip install -e VectorizedMultiAgentSimulator

git clone -b het_control https://github.com/matteobettini/tensordict.git
cd tensordict
python setup.py develop
cd ..

git clone -b het_control https://github.com/matteobettini/rl.git
cd rl
python setup.py develop
cd ..

git clone -b het_control https://github.com/matteobettini/BenchMARL.git
git clone https://github.com/matteobettini/BenchMARL.git
pip install -e BenchMARL
```
3. Install optional dependencies for logging
@@ -45,6 +35,10 @@ git clone https://github.com/proroklab/ControllingBehavioralDiversity.git
pip install -e ControllingBehavioralDiversity
```
5. Try running a script with the command shown below, after step 6 (it will ask whether to use CUDA and wandb; you can change these values in `ControllingBehavioralDiversity/het_control/conf/experiment/het_control_experiment.yaml`)

6. Install Matplotlib (e.g. `pip install matplotlib`)


```
python ControllingBehavioralDiversity/het_control/run_scripts/run_navigation_ippo.py model.desired_snd=0.1
```
22 changes: 21 additions & 1 deletion het_control/conf/experiment/het_control_experiment.yaml
@@ -6,11 +6,17 @@ defaults:
sampling_device: "cuda"
# The device for training (e.g. cuda)
train_device: "cuda"
# New BenchMARL requirement
buffer_device: "cpu"

# Whether to share the parameters of the policy within agent groups
share_policy_params: True # This won't matter as our model ignores it
# If an algorithm and an env support both continuous and discrete actions, what should be preferred
prefer_continuous_actions: True
# New BenchMARL requirement
collect_with_grad: False
# New BenchMARL requirement
parallel_collection: False
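
As a rough sketch of how these keys are consumed (assuming the standard BenchMARL Python API; this is not code from the PR), the same fields can also be set programmatically on `ExperimentConfig`, whose attribute names mirror the YAML keys:

```python
from benchmarl.experiment import ExperimentConfig

# Load BenchMARL's default experiment configuration and override the new fields.
experiment_config = ExperimentConfig.get_from_yaml()
experiment_config.sampling_device = "cuda"
experiment_config.train_device = "cuda"
experiment_config.buffer_device = "cpu"
experiment_config.collect_with_grad = False
experiment_config.parallel_collection = False
```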

# Discount factor
gamma: 0.9
@@ -69,6 +75,14 @@ off_policy_train_batch_size: 128
off_policy_memory_size: 1_000_000
# Number of random action frames to prefill the replay buffer with
off_policy_init_random_frames: 0
# New BenchMARL requirements
# Whether to use priorities while sampling from the replay buffer
off_policy_use_prioritized_replay_buffer: False
# Exponent that determines how much prioritization is used when off_policy_use_prioritized_replay_buffer = True
# PRB reduces to uniform random sampling when alpha = 0
off_policy_prb_alpha: 0.6
# Importance-sampling negative exponent used when off_policy_use_prioritized_replay_buffer = True
off_policy_prb_beta: 0.4
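
To illustrate what the two exponents control, here is a small, library-agnostic sketch of prioritized-replay sampling in plain PyTorch (illustrative values only, not BenchMARL or TorchRL code): transitions are drawn proportionally to priority^alpha, and the induced bias is corrected with importance-sampling weights raised to -beta.

```python
import torch

# Example per-transition priorities (e.g. absolute TD errors); values are made up.
priorities = torch.tensor([0.1, 1.0, 5.0, 0.5])
alpha, beta = 0.6, 0.4
n = priorities.numel()

# alpha shapes the sampling distribution: alpha=0 is uniform, alpha=1 is fully proportional.
probs = priorities.pow(alpha)
probs = probs / probs.sum()

# beta scales the importance-sampling correction applied to each sampled transition's loss.
is_weights = (n * probs).pow(-beta)
is_weights = is_weights / is_weights.max()  # normalize by the max weight, as is common

idx = torch.multinomial(probs, num_samples=2, replacement=True)
print(idx, is_weights[idx])
```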

evaluation: True
# Whether to render the evaluation (if rendering is available)
@@ -79,6 +93,9 @@ evaluation_interval: 120_000
evaluation_episodes: 200
# If True, when stochastic policies are evaluated, their mode is taken, otherwise, if False, they are sampled
evaluation_deterministic_actions: False
# If True, the environment is seeded before evaluation, so the same evaluation env is always used
# If False, evaluation environments will vary more throughout training
evaluation_static: False

# Absolute path to the folder where the experiment will log.
# If null, this will default to the hydra output dir (if using hydra) or to the current folder when the script is run (if not).
@@ -91,9 +108,12 @@ checkpoint_interval: 0

# List of loggers to use, options are: wandb, csv, tensorboard, mlflow
loggers: [wandb]
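# New BenchMARL option (assumed semantics): device remapping used when restoring a checkpoint, as in torch.load's map_location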
restore_map_location: null
project_name: "benchmarl"
# Create a json folder as part of the output in the format of marl-eval
create_json: True

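# Whether to also save a checkpoint when the experiment ends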
checkpoint_at_end: False
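# Number of most recent checkpoints to keep (older ones are removed)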
keep_checkpoints_num: 3



2 changes: 1 addition & 1 deletion het_control/conf/sampling_iddpg_config.yaml
@@ -18,4 +18,4 @@ experiment:
hydra:
searchpath:
# Tells hydra to add the default benchmarl configuration to its path
- pkg://benchmarl/conf
- pkg://benchmarl/conf
32 changes: 24 additions & 8 deletions het_control/models/het_control_mlp_empirical.py
@@ -74,7 +74,10 @@ def __init__(
else None
) # Component that maps std_dev according to scale_mapping

self.input_features = self.input_leaf_spec.shape[-1]
# self.input_features = self.input_leaf_spec.shape[-1]
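# Total per-agent feature size, summed over all input entries, since the model may now receive multiple in_keys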
self.input_features = sum(
[spec.shape[-1] for spec in self.input_spec.values(True, True)]
)
self.output_features = self.output_leaf_spec.shape[-1]

self.shared_mlp = MultiAgentMLP(
@@ -132,9 +135,12 @@ def _forward(
) -> TensorDictBase:
# Gather in_key

input = tensordict.get(
self.in_key
) # Observation tensor of shape [*batch, n_agents, n_features]
# input = tensordict.get(
# self.in_key
# )
input = torch.cat([tensordict.get(in_key) for in_key in self.in_keys], dim=-1)
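# e.g. (hypothetical keys) with in_keys [("agents", "observation"), ("agents", "extra")] of
# feature sizes 6 and 2, this yields a single tensor of shape [*batch, n_agents, 8]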

# Observation tensor of shape [*batch, n_agents, n_features]
shared_out = self.shared_mlp.forward(input)
if agent_index is None: # Gather outputs for all agents on the obs
# tensor of shape [*batch, n_agents, n_actions], where the outputs
@@ -143,7 +149,9 @@
else: # Gather outputs for one agent on the obs
# tensor of shape [*batch, n_agents, n_actions], where the outputs
# along the n_agent dimension are taken with the same (agent_index) agent network
agent_out = self.agent_mlps.agent_networks[agent_index].forward(input)
# agent_out = self.agent_mlps.agent_networks[agent_index].forward(input)
Author: This is the key change, as agent_networks is no longer supported in TorchRL.
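
For context, the snippet below is a minimal, self-contained sketch (not code from this repository or the PR) of the tensordict functional-call pattern used in the new lines: per-agent parameters stored in a TensorDict are temporarily loaded into a single stateless template module for the duration of the `with` block. Module sizes and names are illustrative only.

```python
import torch
from torch import nn
from tensordict import TensorDict

# Two parameter sets for the same architecture, standing in for the per-agent
# parameter TensorDicts stored in self.agent_mlps.params.
net_a = nn.Linear(4, 2)
net_b = nn.Linear(4, 2)
params_a = TensorDict.from_module(net_a)
params_b = TensorDict.from_module(net_b)

# A single stateless "template" module, analogous to self.agent_mlps._empty_net.
template = nn.Linear(4, 2)

x = torch.randn(3, 4)
for params in (params_a, params_b):
    # Temporarily load this agent's parameters into the template; the template's
    # original parameters are restored when the `with` block exits.
    with params.to_module(template):
        y = template(x)
        print(y.shape)  # torch.Size([3, 2])
```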

with self.agent_mlps.params[agent_index].to_module(self.agent_mlps._empty_net):
agent_out = self.agent_mlps._empty_net(input)

shared_out = self.process_shared_out(shared_out)

@@ -166,6 +174,9 @@
or distance.isnan().any() # It is the first iteration
or self.n_agents == 1
):
distance = self.estimate_snd(input)
if update_estimate:
self.estimated_snd[:] = distance.detach()
Comment on lines +177 to +179
Member: Could you explain this a bit? If those conditions are met, we can avoid computing $\widehat{\mathrm{SND}}$.

Author: I did this to be able to log estimated_snd during training when the desired SND is -1. Right now it logs NaNs. It was just to be able to see the evolution of SND while training as well. It can be removed if necessary.

Member: I see, but you can still see it under eval/snd, no?

Author: Yes, but if I understand correctly that is only during evaluation, right? This was helpful for understanding how the SND evolves while training. But you are right, eval/snd is enough. Should we roll back to the previous version?

Member: Ok, got it. I'll take care of it, don't worry.

scaling_ratio = 1.0
else: # DiCo scaling
scaling_ratio = torch.where(
@@ -237,9 +248,14 @@ def estimate_snd(self, obs: torch.Tensor):
"""
agent_actions = []
# Gather what actions each agent would take if given the obs tensor
for agent_net in self.agent_mlps.agent_networks:
agent_outputs = agent_net(obs)
agent_actions.append(agent_outputs)
# for agent_net in self.agent_mlps.agent_networks:
# agent_outputs = agent_net(obs)
# agent_actions.append(agent_outputs)
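# Functional calls: temporarily load each agent's parameters into the shared template net, same pattern as in _forward above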
for agent_index in range(self.n_agents):
with self.agent_mlps.params[agent_index].to_module(self.agent_mlps._empty_net):
agent_out = self.agent_mlps._empty_net(obs)
agent_actions.append(agent_out)


distance = (
compute_behavioral_distance(agent_actions=agent_actions, just_mean=True)