diff --git a/README.md b/README.md
index 1185067..c21f037 100644
--- a/README.md
+++ b/README.md
@@ -22,17 +22,7 @@ Watch the presentation video of DiCo.
 git clone -b het_control https://github.com/proroklab/VectorizedMultiAgentSimulator.git
 pip install -e VectorizedMultiAgentSimulator
 
-git clone -b het_control https://github.com/matteobettini/tensordict.git
-cd tensordict
-python setup.py develop
-cd ..
-
-git clone -b het_control https://github.com/matteobettini/rl.git
-cd rl
-python setup.py develop
-cd ..
-
-git clone -b het_control https://github.com/matteobettini/BenchMARL.git
+git clone https://github.com/matteobettini/BenchMARL.git
 pip install -e BenchMARL
 ```
 3. Install optional dependencies for logging
@@ -45,6 +35,10 @@ git clone https://github.com/proroklab/ControllingBehavioralDiversity.git
 pip install -e ControllingBehavioralDiversity
 ```
 5. Try running a script (it will ask for cuda and wandb, you can change these values in `ControllingBehavioralDiversity/het_control/conf/experiment/het_control_experiment.yaml`)
+
+6. Install Matplotlib: `pip install matplotlib`
+
+
 ```
 python ControllingBehavioralDiversity/het_control/run_scripts/run_navigation_ippo.py model.desired_snd=0.1
 ```
diff --git a/het_control/conf/experiment/het_control_experiment.yaml b/het_control/conf/experiment/het_control_experiment.yaml
index 5d1dc1a..c280460 100644
--- a/het_control/conf/experiment/het_control_experiment.yaml
+++ b/het_control/conf/experiment/het_control_experiment.yaml
@@ -6,11 +6,17 @@ defaults:
 sampling_device: "cuda"
 # The device for training (e.g. cuda)
 train_device: "cuda"
+# new BenchMARL requirement
+buffer_device: "cpu"
 
 # Whether to share the parameters of the policy within agent groups
 share_policy_params: True # This won't matter as our model ignores it
 # If an algorithm and an env support both continuous and discrete actions, what should be preferred
 prefer_continuous_actions: True
+# new BenchMARL requirement
+collect_with_grad: False
+# new BenchMARL requirement
+parallel_collection: False
 
 # Discount factor
 gamma: 0.9
@@ -69,6 +75,14 @@ off_policy_train_batch_size: 128
 off_policy_memory_size: 1_000_000
 # Number of random action frames to prefill the replay buffer with
 off_policy_init_random_frames: 0
+# new BenchMARL requirements
+# Whether to use priorities while sampling from the replay buffer
+off_policy_use_prioritized_replay_buffer: False
+# Exponent that determines how much prioritization is used when off_policy_use_prioritized_replay_buffer = True
+# PRB reduces to random sampling when alpha=0
+off_policy_prb_alpha: 0.6
+# Importance sampling negative exponent when off_policy_use_prioritized_replay_buffer = True
+off_policy_prb_beta: 0.4
 
 evaluation: True
 # Whether to render the evaluation (if rendering is available
@@ -79,6 +93,9 @@ evaluation_interval: 120_000
 evaluation_episodes: 200
 # If True, when stochastic policies are evaluated, their mode is taken, otherwise, if False, they are sampled
 evaluation_deterministic_actions: False
+# If True, seed the environment before evaluation so that the same evaluation env is always used
+# If False, evaluation environments will vary throughout training
+evaluation_static: False
 
 # Absolute path to the folder where the experiment will log.
 # If null, this will default to the hydra output dir (if using hydra) or to the current folder when the script is run (if not).
@@ -91,9 +108,12 @@ checkpoint_interval: 0
 
 # List of loggers to use, options are: wandb, csv, tensorboard, mflow
 loggers: [wandb]
+restore_map_location: null
+project_name: "benchmarl"
 
 # Create a json folder as part of the output in the format of marl-eval
 create_json: True
-
+checkpoint_at_end: False
+keep_checkpoints_num: 3
diff --git a/het_control/conf/sampling_iddpg_config.yaml b/het_control/conf/sampling_iddpg_config.yaml
index dbfc3af..f104197 100644
--- a/het_control/conf/sampling_iddpg_config.yaml
+++ b/het_control/conf/sampling_iddpg_config.yaml
@@ -18,4 +18,4 @@ experiment:
 hydra:
   searchpath:
     # Tells hydra to add the default benchmarl configuration to its path
-    - pkg://benchmarl/conf
\ No newline at end of file
+    - pkg://benchmarl/conf
diff --git a/het_control/models/het_control_mlp_empirical.py b/het_control/models/het_control_mlp_empirical.py
index 693f855..64ddc4e 100644
--- a/het_control/models/het_control_mlp_empirical.py
+++ b/het_control/models/het_control_mlp_empirical.py
@@ -74,7 +74,10 @@ def __init__(
             else None
         )  # Components that maps std_dev according to scale_mapping
 
-        self.input_features = self.input_leaf_spec.shape[-1]
+        # self.input_features = self.input_leaf_spec.shape[-1]
+        self.input_features = sum(
+            [spec.shape[-1] for spec in self.input_spec.values(True, True)]
+        )
         self.output_features = self.output_leaf_spec.shape[-1]
 
         self.shared_mlp = MultiAgentMLP(
@@ -132,9 +135,12 @@ def _forward(
     ) -> TensorDictBase:
 
         # Gather in_key
-        input = tensordict.get(
-            self.in_key
-        )  # Observation tensor of shape [*batch, n_agents, n_features]
+        # input = tensordict.get(
+        #     self.in_key
+        # )
+        input = torch.cat([tensordict.get(in_key) for in_key in self.in_keys], dim=-1)
+
+        # Observation tensor of shape [*batch, n_agents, n_features]
         shared_out = self.shared_mlp.forward(input)
         if agent_index is None:  # Gather outputs for all agents on the obs
             # tensor of shape [*batch, n_agents, n_actions], where the outputs
@@ -143,7 +149,9 @@ def _forward(
         else:  # Gather outputs for one agent on the obs
             # tensor of shape [*batch, n_agents, n_actions], where the outputs
             # along the n_agent dimension are taken with the same (agent_index) agent network
-            agent_out = self.agent_mlps.agent_networks[agent_index].forward(input)
+            # agent_out = self.agent_mlps.agent_networks[agent_index].forward(input)
+            with self.agent_mlps.params[agent_index].to_module(self.agent_mlps._empty_net):
+                agent_out = self.agent_mlps._empty_net(input)
 
         shared_out = self.process_shared_out(shared_out)
 
@@ -166,6 +174,9 @@ def _forward(
             or distance.isnan().any()  # It is the first iteration
             or self.n_agents == 1
         ):
+            distance = self.estimate_snd(input)
+            if update_estimate:
+                self.estimated_snd[:] = distance.detach()
             scaling_ratio = 1.0
         else:  # DiCo scaling
             scaling_ratio = torch.where(
@@ -237,9 +248,14 @@ def estimate_snd(self, obs: torch.Tensor):
         """
         agent_actions = []
         # Gather what actions each agent would take if given the obs tensor
-        for agent_net in self.agent_mlps.agent_networks:
-            agent_outputs = agent_net(obs)
-            agent_actions.append(agent_outputs)
+        # for agent_net in self.agent_mlps.agent_networks:
+        #     agent_outputs = agent_net(obs)
+        #     agent_actions.append(agent_outputs)
+        for agent_index in range(self.n_agents):
+            with self.agent_mlps.params[agent_index].to_module(self.agent_mlps._empty_net):
+                agent_out = self.agent_mlps._empty_net(obs)
+            agent_actions.append(agent_out)
+
         distance = (
             compute_behavioral_distance(agent_actions=agent_actions, just_mean=True)
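
For context on the `params[agent_index].to_module(self.agent_mlps._empty_net)` calls introduced in `het_control_mlp_empirical.py`: as the patch reflects, recent TorchRL/tensordict versions keep the per-agent weights of `MultiAgentMLP` in a stacked `params` TensorDict and run them through a single stateless template network (`_empty_net`) instead of exposing an `agent_networks` list. The snippet below is a minimal standalone sketch of that tensordict functional-call idiom, not code from this repo; `make_net`, `template`, and `agent_params` are illustrative names, and it assumes a tensordict version that provides `TensorDict.from_module` and context-manager `to_module`.

```python
import torch
from torch import nn
from tensordict import TensorDict

n_agents, n_features, n_actions, batch = 3, 8, 2, 4


def make_net() -> nn.Module:
    # Toy per-agent policy head; stands in for the per-agent MLPs.
    return nn.Sequential(nn.Linear(n_features, 32), nn.Tanh(), nn.Linear(32, n_actions))


# One stateless "template" network plus one parameter TensorDict per agent.
template = make_net()
agent_params = [TensorDict.from_module(make_net()) for _ in range(n_agents)]

obs = torch.randn(batch, n_features)

agent_actions = []
for params in agent_params:
    # Temporarily swap this agent's parameters into the template, run the
    # forward pass, then restore the previous parameters on exiting the block.
    with params.to_module(template):
        agent_actions.append(template(obs))

print(torch.stack(agent_actions).shape)  # torch.Size([3, 4, 2])
```

Swapping parameter sets in and out of one template is what lets the patched `_forward` and `estimate_snd` paths above query each agent's policy on the same observations without the removed `agent_networks` attribute.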