From 42cbfbcb0dabcde207f440f6e4126c53a86c83c0 Mon Sep 17 00:00:00 2001 From: veds12 Date: Mon, 14 Feb 2022 11:57:23 +0530 Subject: [PATCH 01/24] added skeleton --- mbrl/algorithms/dreamer.py | 183 ++++++++++++++++++++++++++++++++ mbrl/planning/trajectory_opt.py | 30 ++++++ 2 files changed, 213 insertions(+) create mode 100644 mbrl/algorithms/dreamer.py diff --git a/mbrl/algorithms/dreamer.py b/mbrl/algorithms/dreamer.py new file mode 100644 index 00000000..846c5121 --- /dev/null +++ b/mbrl/algorithms/dreamer.py @@ -0,0 +1,183 @@ +import os +import pathlib +from typing import List, Optional, Union + +import gym +import hydra +import numpy as np +import omegaconf +import torch + +import mbrl.constants +from mbrl.env.termination_fns import no_termination +from mbrl.models import ModelEnv, ModelTrainer +from mbrl.planning import RandomAgent, create_trajectory_optim_agent_for_model, DreamerActorCritic +from mbrl.util.common import ( + create_replay_buffer, + get_sequence_buffer_iterator, + rollout_agent_trajectories, +) + +METRICS_LOG_FORMAT = [ + ("observations_loss", "OL", "float"), + ("reward_loss", "RL", "float"), + ("gradient_norm", "GN", "float"), + ("kl_loss", "KL", "float"), +] + +from models.planet import PlaNetModel + +def train( ## need to modify this + env: gym.Env, + cfg: omegaconf.DictConfig, + silent: bool = False, + work_dir: Union[Optional[str], pathlib.Path] = None, +) -> np.float32: + # Experiment initialization + debug_mode = cfg.get("debug_mode", False) + + if work_dir is None: + work_dir = os.getcwd() + work_dir = pathlib.Path(work_dir) + print(f"Results will be saved at {work_dir}.") + + if silent: + logger = None + else: + logger = Logger(work_dir) + logger.register_group("metrics", METRICS_LOG_FORMAT, color="yellow") + logger.register_group( + mbrl.constants.RESULTS_LOG_NAME, + [ + ("env_step", "S", "int"), + ("train_episode_reward", "RT", "float"), + ("episode_reward", "ET", "float"), + ], + color="green", + ) + + rng = torch.Generator(device=cfg.device) + rng.manual_seed(cfg.seed) + np_rng = np.random.default_rng(seed=cfg.seed) + + # Create replay buffer and collect initial data + replay_buffer = create_replay_buffer( + cfg, + env.observation_space.shape, + env.action_space.shape, + collect_trajectories=True, + rng=np_rng, + ) + rollout_agent_trajectories( + env, + cfg.algorithm.num_initial_trajectories, + RandomAgent(env), + agent_kwargs={}, + replay_buffer=replay_buffer, + collect_full_trajectories=True, + trial_length=cfg.overrides.trial_length, + agent_uses_low_dim_obs=False, + ) + + # Create PlaNet model + cfg.dynamics_model.action_size = env.action_space.shape[0] + planet = hydra.utils.instantiate(cfg.dynamics_model) + assert isinstance(planet, mbrl.models.PlaNetModel) + model_env = ModelEnv(env, planet, no_termination, generator=rng) + trainer = ModelTrainer(planet, logger=logger, optim_lr=1e-3, optim_eps=1e-4) + + # Create CEM agent + # This agent rolls outs trajectories using ModelEnv, which uses planet.sample() + # to simulate the trajectories from the prior transition model + # The starting point for trajectories is conditioned on the latest observation, + # for which we use planet.update_posterior() after each environment step + agent = create_trajectory_optim_agent_for_model(model_env, cfg.algorithm.agent) + + # Callback and containers to accumulate training statistics and average over batch + rec_losses: List[float] = [] + reward_losses: List[float] = [] + kl_losses: List[float] = [] + grad_norms: List[float] = [] + + def 
get_metrics_and_clear_metric_containers(): + metrics_ = { + "observations_loss": np.mean(rec_losses).item(), + "reward_loss": np.mean(reward_losses).item(), + "gradient_norm": np.mean(grad_norms).item(), + "kl_loss": np.mean(kl_losses).item(), + } + + for c in [rec_losses, reward_losses, kl_losses, grad_norms]: + c.clear() + + return metrics_ + + def batch_callback(_epoch, _loss, meta, _mode): + if meta: + rec_losses.append(meta["observations_loss"]) + reward_losses.append(meta["reward_loss"]) + kl_losses.append(meta["kl_loss"]) + if "grad_norm" in meta: + grad_norms.append(meta["grad_norm"]) + + def is_test_episode(episode_): + return episode_ % cfg.algorithm.test_frequency == 0 + + # PlaNet loop + step = replay_buffer.num_stored + total_rewards = 0.0 + for episode in range(cfg.algorithm.num_episodes): + # Train the model for one epoch of `num_grad_updates` + dataset, _ = get_sequence_buffer_iterator( + replay_buffer, + cfg.overrides.batch_size, + 0, # no validation data + cfg.overrides.sequence_length, + max_batches_per_loop_train=cfg.overrides.num_grad_updates, + use_simple_sampler=True, + ) + trainer.train( + dataset, num_epochs=1, batch_callback=batch_callback, evaluate=False + ) + planet.save(work_dir / "planet.pth") + replay_buffer.save(work_dir) + metrics = get_metrics_and_clear_metric_containers() + logger.log_data("metrics", metrics) + + # Collect one episode of data + episode_reward = 0.0 + obs = env.reset() + agent.reset() + planet.reset_posterior() + action = None + done = False + while not done: + planet.update_posterior(obs, action=action, rng=rng) + action_noise = ( + 0 + if is_test_episode(episode) + else cfg.overrides.action_noise_std + * np_rng.standard_normal(env.action_space.shape[0]) + ) + action = agent.act(obs) + action_noise + action = np.clip(action, -1.0, 1.0) # to account for the noise + next_obs, reward, done, info = env.step(action) + replay_buffer.add(obs, action, next_obs, reward, done) + episode_reward += reward + obs = next_obs + if debug_mode: + print(f"step: {step}, reward: {reward}.") + step += 1 + total_rewards += episode_reward + logger.log_data( + mbrl.constants.RESULTS_LOG_NAME, + { + "episode_reward": episode_reward * is_test_episode(episode), + "train_episode_reward": episode_reward * (1 - is_test_episode(episode)), + "env_step": step, + }, + ) + + # returns average episode reward (e.g., to use for tuning learning curves) + return total_rewards / cfg.algorithm.num_episodes + diff --git a/mbrl/planning/trajectory_opt.py b/mbrl/planning/trajectory_opt.py index 9d857522..051446ee 100644 --- a/mbrl/planning/trajectory_opt.py +++ b/mbrl/planning/trajectory_opt.py @@ -2,6 +2,7 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+import nntplib import time from typing import Callable, List, Optional, Sequence, Tuple, cast @@ -10,6 +11,8 @@ import omegaconf import torch import torch.distributions +import torch.nn as nn +import torch.nn.functional as F import mbrl.models import mbrl.types @@ -486,6 +489,33 @@ def optimize( return mu if self.return_mean_elites else best_solution +class ActorCriticOPtimizer(Optimizer): + """Actor Critic Planning agent used for Dreamers""" + def __init__( + self, + action_dim, + latent_dim, + ): + # self.conv1 = nn.Conv2d(3, 32, 4, stride=4) + # self.conv2 = nn.Conv2d(32, 64, 3, stride=2) + # self.conv3 = nn.Conv2d(64, 64, 3, stride=1) + self.linear1 = nn.Linear(latent_dim, 512) + self.critic_linear = nn.Linear(512, 1) + self.actor_linear = nn.Linear(512, action_dim) + + self.train() + + def act(self, inputs): + x = F.relu(self.conv1(inputs / 255.)) + x = F.relu(self.conv2(x)) + # x = F.relu(self.conv3(x)) + x = x.view(-1, 64 * 6 * 6) + x = F.relu(self.linear1(x)) + + return self.critic_linear(x), self.actor_linear(x) + + def optimize(self, traj, obs): + pass class TrajectoryOptimizer: """Class for using generic optimizers on trajectory optimization problems. From 52218dbe37d931db156d65cda33156fcdbb24986 Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Sun, 10 Apr 2022 21:22:15 -0400 Subject: [PATCH 02/24] update gym and test commit hooks --- requirements/main.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/main.txt b/requirements/main.txt index f730fa74..ba2f81fe 100644 --- a/requirements/main.txt +++ b/requirements/main.txt @@ -4,7 +4,7 @@ tensorboard>=2.4.0 imageio>=2.9.0 numpy>=1.19.1 matplotlib>=3.3.1 -gym==0.17.2 +gym>=0.20.0 jupyter>=1.0.0 pytest>=6.0.1 sk-video>=1.1.10 From 3fb3ee9248f5713b72677dba5078736b32fced38 Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Sun, 10 Apr 2022 21:28:13 -0400 Subject: [PATCH 03/24] Initial commit --- mbrl/algorithms/dreamer.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mbrl/algorithms/dreamer.py b/mbrl/algorithms/dreamer.py index 846c5121..d35d3b7e 100644 --- a/mbrl/algorithms/dreamer.py +++ b/mbrl/algorithms/dreamer.py @@ -1,3 +1,7 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
import os import pathlib from typing import List, Optional, Union @@ -11,7 +15,8 @@ import mbrl.constants from mbrl.env.termination_fns import no_termination from mbrl.models import ModelEnv, ModelTrainer -from mbrl.planning import RandomAgent, create_trajectory_optim_agent_for_model, DreamerActorCritic +from mbrl.planning import RandomAgent, create_trajectory_optim_agent_for_model +from mbrl.util import Logger from mbrl.util.common import ( create_replay_buffer, get_sequence_buffer_iterator, @@ -25,9 +30,8 @@ ("kl_loss", "KL", "float"), ] -from models.planet import PlaNetModel -def train( ## need to modify this +def train( env: gym.Env, cfg: omegaconf.DictConfig, silent: bool = False, @@ -180,4 +184,3 @@ def is_test_episode(episode_): # returns average episode reward (e.g., to use for tuning learning curves) return total_rewards / cfg.algorithm.num_episodes - From 630c1ce00ce0a756d50fc63539be984074bbc0e9 Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Mon, 11 Apr 2022 00:21:24 -0400 Subject: [PATCH 04/24] pre-commit fixes --- .pre-commit-config.yaml | 3 +-- mbrl/planning/trajectory_opt.py | 30 ------------------------------ pyproyect.toml | 20 -------------------- requirements/dev.txt | 2 +- setup.cfg | 1 - 5 files changed, 2 insertions(+), 54 deletions(-) delete mode 100644 pyproyect.toml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1a57593b..a59f15ab 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,10 +1,9 @@ repos: - repo: https://github.com/psf/black - rev: 22.1.0 + rev: 22.3.0 hooks: - id: black files: 'mbrl' - language_version: python3.7 - repo: https://gitlab.com/pycqa/flake8 rev: 3.9.2 diff --git a/mbrl/planning/trajectory_opt.py b/mbrl/planning/trajectory_opt.py index 336b4cfd..5b04d5cb 100644 --- a/mbrl/planning/trajectory_opt.py +++ b/mbrl/planning/trajectory_opt.py @@ -2,7 +2,6 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -import nntplib import time from typing import Callable, List, Optional, Sequence, Tuple, cast @@ -11,8 +10,6 @@ import omegaconf import torch import torch.distributions -import torch.nn as nn -import torch.nn.functional as F import mbrl.models import mbrl.types @@ -489,33 +486,6 @@ def optimize( return mu if self.return_mean_elites else best_solution -class ActorCriticOPtimizer(Optimizer): - """Actor Critic Planning agent used for Dreamers""" - def __init__( - self, - action_dim, - latent_dim, - ): - # self.conv1 = nn.Conv2d(3, 32, 4, stride=4) - # self.conv2 = nn.Conv2d(32, 64, 3, stride=2) - # self.conv3 = nn.Conv2d(64, 64, 3, stride=1) - self.linear1 = nn.Linear(latent_dim, 512) - self.critic_linear = nn.Linear(512, 1) - self.actor_linear = nn.Linear(512, action_dim) - - self.train() - - def act(self, inputs): - x = F.relu(self.conv1(inputs / 255.)) - x = F.relu(self.conv2(x)) - # x = F.relu(self.conv3(x)) - x = x.view(-1, 64 * 6 * 6) - x = F.relu(self.linear1(x)) - - return self.critic_linear(x), self.actor_linear(x) - - def optimize(self, traj, obs): - pass class TrajectoryOptimizer: """Class for using generic optimizers on trajectory optimization problems. 
diff --git a/pyproyect.toml b/pyproyect.toml deleted file mode 100644 index 43e6846b..00000000 --- a/pyproyect.toml +++ /dev/null @@ -1,20 +0,0 @@ -[build-system] -requires = [ - "setuptools>=42", - "wheel" -] -build-backend = "setuptools.build_meta" - -[tool.black] -line-length = 88 -exclude = ''' -( - /( - .eggs # exclude a few common directories in the - | .git # root of the project - | .mypy_cache - | docs - | *personal* - ) -) -''' \ No newline at end of file diff --git a/requirements/dev.txt b/requirements/dev.txt index 5784d729..e7773d0b 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -3,7 +3,7 @@ nbsphinx>=0.8.0 sphinx-rtd-theme>=0.5.0 flake8>=3.8.4 mypy>=0.902 -black>=21.4b2 +black>=22.3.0 pytest>=6.0.1 types-pyyaml>=0.1.6 types-termcolor>=0.1.0 diff --git a/setup.cfg b/setup.cfg index 0704346f..dbf4d08c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -11,7 +11,6 @@ exclude = mbrl/third_party/* [mypy] -python_version = 3.7 ignore_missing_imports = True show_error_codes = True strict_optional = False From d591586fe7d455971e80682602f091c19ec2f581 Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Mon, 11 Apr 2022 00:22:16 -0400 Subject: [PATCH 05/24] fixed pyproject.toml --- pyproject.toml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..19385131 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,19 @@ +[build-system] +requires = [ + "setuptools>=42", + "wheel" +] +build-backend = "setuptools.build_meta" + +[tool.black] +line-length = 88 +exclude = ''' +( + /( + .eggs # exclude a few common directories in the + | .git # root of the project + | .mypy_cache + | docs + ) +) +''' \ No newline at end of file From 85e7c14fb12110d9adfc8ee07a34b7a5827f80f5 Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Tue, 12 Apr 2022 13:59:46 -0400 Subject: [PATCH 06/24] dreamer core; bug fixes --- mbrl/algorithms/dreamer.py | 65 +++++++++---- mbrl/algorithms/planet.py | 5 +- mbrl/models/planet.py | 2 +- mbrl/planning/__init__.py | 1 + mbrl/planning/core.py | 13 ++- mbrl/planning/dreamer_agent.py | 164 +++++++++++++++++++++++++++++++++ 6 files changed, 225 insertions(+), 25 deletions(-) create mode 100644 mbrl/planning/dreamer_agent.py diff --git a/mbrl/algorithms/dreamer.py b/mbrl/algorithms/dreamer.py index d35d3b7e..72d252c5 100644 --- a/mbrl/algorithms/dreamer.py +++ b/mbrl/algorithms/dreamer.py @@ -11,11 +11,12 @@ import numpy as np import omegaconf import torch +from tqdm import tqdm import mbrl.constants from mbrl.env.termination_fns import no_termination -from mbrl.models import ModelEnv, ModelTrainer -from mbrl.planning import RandomAgent, create_trajectory_optim_agent_for_model +from mbrl.models import ModelEnv, ModelTrainer, PlaNetModel +from mbrl.planning import DreamerAgent, RandomAgent, create_dreamer_agent_for_model from mbrl.util import Logger from mbrl.util.common import ( create_replay_buffer, @@ -85,52 +86,74 @@ def train( # Create PlaNet model cfg.dynamics_model.action_size = env.action_space.shape[0] - planet = hydra.utils.instantiate(cfg.dynamics_model) - assert isinstance(planet, mbrl.models.PlaNetModel) + planet: PlaNetModel = hydra.utils.instantiate(cfg.dynamics_model) model_env = ModelEnv(env, planet, no_termination, generator=rng) trainer = ModelTrainer(planet, logger=logger, optim_lr=1e-3, optim_eps=1e-4) - # Create CEM agent + # Create Dreamer agent # This agent rolls outs trajectories using ModelEnv, which uses planet.sample() # to simulate the 
trajectories from the prior transition model - # The starting point for trajectories is conditioned on the latest observation, - # for which we use planet.update_posterior() after each environment step - agent = create_trajectory_optim_agent_for_model(model_env, cfg.algorithm.agent) + # The starting point for trajectories is each imagined state output by the + # representation model from the dataset of environment observations + agent: DreamerAgent = create_dreamer_agent_for_model( + planet, model_env, cfg.algorithm.agent + ) # Callback and containers to accumulate training statistics and average over batch rec_losses: List[float] = [] reward_losses: List[float] = [] + policy_losses: List[float] = [] + critic_losses: List[float] = [] kl_losses: List[float] = [] - grad_norms: List[float] = [] + model_grad_norms: List[float] = [] + agent_grad_norms: List[float] = [] def get_metrics_and_clear_metric_containers(): metrics_ = { "observations_loss": np.mean(rec_losses).item(), "reward_loss": np.mean(reward_losses).item(), - "gradient_norm": np.mean(grad_norms).item(), + "policy_loss": np.mean(policy_losses).item(), + "critic_loss": np.mean(critic_losses).item(), + "model_gradient_norm": np.mean(model_grad_norms).item(), + "agent_gradient_norm": np.mean(agent_grad_norms).item(), "kl_loss": np.mean(kl_losses).item(), } - for c in [rec_losses, reward_losses, kl_losses, grad_norms]: + for c in [ + rec_losses, + reward_losses, + policy_losses, + critic_losses, + kl_losses, + model_grad_norms, + agent_grad_norms, + ]: c.clear() return metrics_ - def batch_callback(_epoch, _loss, meta, _mode): + def model_batch_callback(_epoch, _loss, meta, _mode): if meta: rec_losses.append(meta["observations_loss"]) reward_losses.append(meta["reward_loss"]) kl_losses.append(meta["kl_loss"]) if "grad_norm" in meta: - grad_norms.append(meta["grad_norm"]) + model_grad_norms.append(meta["grad_norm"]) + + def agent_batch_callback(_epoch, _loss, meta, _mode): + if meta: + policy_losses.append(meta["policy_loss"]) + critic_losses.append(meta["critic_loss"]) + if "grad_norm" in meta: + agent_grad_norms.append(meta["grad_norm"]) def is_test_episode(episode_): return episode_ % cfg.algorithm.test_frequency == 0 - # PlaNet loop + # Dreamer loop step = replay_buffer.num_stored total_rewards = 0.0 - for episode in range(cfg.algorithm.num_episodes): + for episode in tqdm(range(cfg.algorithm.num_episodes)): # Train the model for one epoch of `num_grad_updates` dataset, _ = get_sequence_buffer_iterator( replay_buffer, @@ -141,9 +164,13 @@ def is_test_episode(episode_): use_simple_sampler=True, ) trainer.train( - dataset, num_epochs=1, batch_callback=batch_callback, evaluate=False + dataset, num_epochs=1, batch_callback=model_batch_callback, evaluate=False + ) + agent.train( + dataset, num_epochs=1, batch_callback=agent_batch_callback, evaluate=False ) - planet.save(work_dir / "planet.pth") + planet.save(work_dir) + agent.save(work_dir) replay_buffer.save(work_dir) metrics = get_metrics_and_clear_metric_containers() logger.log_data("metrics", metrics) @@ -156,14 +183,14 @@ def is_test_episode(episode_): action = None done = False while not done: - planet.update_posterior(obs, action=action, rng=rng) + latent_state = planet.update_posterior(obs, action=action, rng=rng) action_noise = ( 0 if is_test_episode(episode) else cfg.overrides.action_noise_std * np_rng.standard_normal(env.action_space.shape[0]) ) - action = agent.act(obs) + action_noise + action = agent.act(latent_state) + action_noise action = np.clip(action, -1.0, 1.0) # to 
account for the noise next_obs, reward, done, info = env.step(action) replay_buffer.add(obs, action, next_obs, reward, done) diff --git a/mbrl/algorithms/planet.py b/mbrl/algorithms/planet.py index d35d3b7e..eec065f0 100644 --- a/mbrl/algorithms/planet.py +++ b/mbrl/algorithms/planet.py @@ -11,6 +11,7 @@ import numpy as np import omegaconf import torch +from tqdm import tqdm import mbrl.constants from mbrl.env.termination_fns import no_termination @@ -130,7 +131,7 @@ def is_test_episode(episode_): # PlaNet loop step = replay_buffer.num_stored total_rewards = 0.0 - for episode in range(cfg.algorithm.num_episodes): + for episode in tqdm(range(cfg.algorithm.num_episodes)): # Train the model for one epoch of `num_grad_updates` dataset, _ = get_sequence_buffer_iterator( replay_buffer, @@ -143,7 +144,7 @@ def is_test_episode(episode_): trainer.train( dataset, num_epochs=1, batch_callback=batch_callback, evaluate=False ) - planet.save(work_dir / "planet.pth") + planet.save(work_dir) replay_buffer.save(work_dir) metrics = get_metrics_and_clear_metric_containers() logger.log_data("metrics", metrics) diff --git a/mbrl/models/planet.py b/mbrl/models/planet.py index 8f206413..e8841755 100644 --- a/mbrl/models/planet.py +++ b/mbrl/models/planet.py @@ -101,7 +101,7 @@ def forward( class MeanStdCat(nn.Module): - # Convenience module to avoid having to write chuck and softplus in multiple places + # Convenience module to avoid having to write chunk and softplus in multiple places # (since it's needed for prior and posterior params) def __init__(self, latent_state_size: int, min_std: float): super().__init__() diff --git a/mbrl/planning/__init__.py b/mbrl/planning/__init__.py index 772afcfb..9fda86db 100644 --- a/mbrl/planning/__init__.py +++ b/mbrl/planning/__init__.py @@ -3,6 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from .core import Agent, RandomAgent, complete_agent_cfg, load_agent +from .dreamer_agent import DreamerAgent, create_dreamer_agent_for_model from .trajectory_opt import ( CEMOptimizer, ICEMOptimizer, diff --git a/mbrl/planning/core.py b/mbrl/planning/core.py index 29ca7f93..a2fdd269 100644 --- a/mbrl/planning/core.py +++ b/mbrl/planning/core.py @@ -150,8 +150,15 @@ def load_agent(agent_path: Union[str, pathlib.Path], env: gym.Env) -> Agent: from .sac_wrapper import SACAgent complete_agent_cfg(env, cfg.algorithm.agent) - agent: pytorch_sac.SAC = hydra.utils.instantiate(cfg.algorithm.agent) - agent.load_checkpoint(ckpt_path=agent_path / "sac.pth") - return SACAgent(agent) + sac: pytorch_sac.SAC = hydra.utils.instantiate(cfg.algorithm.agent) + sac.load_checkpoint(ckpt_path=agent_path / "sac.pth") + return SACAgent(sac) + elif cfg.algorithm.agent == "mbrl.planning.dreamer_agent.DreamerAgent": + from mbrl.planning.dreamer_agent import DreamerAgent + + complete_agent_cfg(env, cfg.algorithm.agent) + dreamer_agent: DreamerAgent = hydra.utils.instantiate(cfg.algorithm.agent) + dreamer_agent.load(agent_path) + return dreamer_agent else: raise ValueError("Invalid agent configuration.") diff --git a/mbrl/planning/dreamer_agent.py b/mbrl/planning/dreamer_agent.py new file mode 100644 index 00000000..fe4b806c --- /dev/null +++ b/mbrl/planning/dreamer_agent.py @@ -0,0 +1,164 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+import pathlib +from typing import Callable, Dict, List, Optional, Tuple, Union + +import hydra +import numpy as np +import omegaconf +import torch +from torch import nn +from torch.distributions import TanhTransform +from torch.nn import functional as F +from torch.optim import Adam + +import mbrl.models +from mbrl.util.replay_buffer import TransitionIterator + +from .core import Agent, complete_agent_cfg + + +class Policy(nn.Module): + def __init__( + self, + latent_size: int, + action_size: int, + hidden_size: int, + min_std: float = 1e-4, + init_std: float = 5, + mean_scale: float = 5, + ): + super().__init__() + self.model = nn.Sequential( + nn.Linear(latent_size, hidden_size), + nn.ELU(), + nn.Linear(hidden_size, hidden_size), + nn.ELU(), + nn.Linear(hidden_size, action_size * 2), + ) + self.min_std = min_std + self.init_std = init_std + self.mean_scale = mean_scale + self.raw_init_std = np.log(np.exp(self.init_std) - 1) + + def forward(self, latent_state): + latent_state = torch.cat(latent_state.values(), dim=-1) + model_out = self.model(latent_state) + mean, std = torch.chunk(model_out, 2, -1) + mean = self.mean_scale * torch.tanh(mean / self.mean_scale) + std = F.softplus(std + self.raw_init_std) + self.min_std + dist = torch.distributions.Normal(mean, std) + dist = torch.distributions.TransformedDistribution(dist, TanhTransform()) + dist = torch.distributions.Independent(dist, 1) + return dist + + +class DreamerAgent(Agent): + def __init__( + self, + planet: mbrl.models.PlaNetModel, + model_env: mbrl.models.ModelEnv, + device: torch.device, + hidden_size_fcs: int = 200, + horizon: int = 15, + policy_lr: float = 8e-5, + critic_lr: float = 8e-5, + grad_clip_norm: float = 1000.0, + rng: Optional[torch.Generator] = None, + ) -> None: + self.planet: mbrl.models.PlaNetModel = planet + self.model_env = model_env + self.device = device + self.horizon = horizon + self.latent_size = planet.latent_state_size + planet.belief_size + self.action_size = planet.action_size + + self.policy = Policy( + self.latent_size, + self.action_size, + hidden_size_fcs, + ).to(device) + self.policy_optim = Adam(self.policy.parameters(), policy_lr) + self.critic = nn.Sequential( + nn.Linear(self.latent_size, hidden_size_fcs), + nn.ELU(), + nn.Linear(hidden_size_fcs, hidden_size_fcs), + nn.ELU(), + nn.Linear(hidden_size_fcs, 1), + ).to(device) + self.critic_optim = Adam(self.critic.parameters(), critic_lr) + + def act(self, obs: Dict[str, mbrl.types.TensorType], **_kwargs) -> np.ndarray: + action_dist = self.policy(obs) + action = action_dist.sample() + return action.cpu().detach().numpy() + + def train( + self, + dataset_train: TransitionIterator, + dataset_val: Optional[TransitionIterator] = None, + num_epochs: Optional[int] = None, + patience: Optional[int] = None, + improvement_threshold: float = 0.01, + callback: Optional[Callable] = None, + batch_callback: Optional[Callable] = None, + evaluate: bool = True, + silent: bool = False, + ) -> Tuple[List[float], List[float]]: + raise NotImplementedError + + def save(self, save_dir: Union[str, pathlib.Path]): + """Saves the agent to the given directory.""" + save_path = pathlib.Path(save_dir) / "agent.pth" + print("Saving models to {}".format(save_path)) + torch.save( + { + "policy_state_dict": self.policy.state_dict(), + "actor_optimizer_state_dict": self.policy_optim.state_dict(), + "critic_state_dict": self.critic.state_dict(), + "critic_optimizer_state_dict": self.critic_optim.state_dict(), + }, + save_path, + ) + + def load(self, load_dir: Union[str, 
pathlib.Path], evaluate=False): + """Loads the agent from the given directory.""" + load_path = pathlib.Path(load_dir) / "agent.pth" + print("Saving models to {}".format(load_path)) + checkpoint = torch.load(load_path) + self.policy.load_state_dict(checkpoint["policy_state_dict"]) + self.policy_optim.load_state_dict(checkpoint["policy_optimizer_state_dict"]) + self.critic.load_state_dict(checkpoint["critic_state_dict"]) + self.critic_optim.load_state_dict(checkpoint["critic_optimizer_state_dict"]) + + if evaluate: + self.policy.eval() + self.critic.eval() + else: + self.policy.train() + self.critic.train() + + +def create_dreamer_agent_for_model( + planet: mbrl.models.PlaNetModel, + model_env: mbrl.models.ModelEnv, + agent_cfg: omegaconf.DictConfig, +) -> DreamerAgent: + """Utility function for creating an dreamer agent for a model environment. + + This is a convenience function for creating a :class:`DreamerAgent` + + + Args: + model_env (mbrl.models.ModelEnv): the model environment. + agent_cfg (omegaconf.DictConfig): the agent's configuration. + + Returns: + (:class:`DreamerAgent`): the agent. + + """ + complete_agent_cfg(model_env, agent_cfg) + agent = hydra.utils.instantiate(agent_cfg) + return agent From fd38c62bb92fd772f0250f0c9e0d5f796024cbd0 Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Tue, 12 Apr 2022 16:22:03 -0400 Subject: [PATCH 07/24] dtype fix --- mbrl/algorithms/dreamer.py | 4 +++- mbrl/algorithms/planet.py | 8 +++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/mbrl/algorithms/dreamer.py b/mbrl/algorithms/dreamer.py index 72d252c5..448266cf 100644 --- a/mbrl/algorithms/dreamer.py +++ b/mbrl/algorithms/dreamer.py @@ -191,7 +191,9 @@ def is_test_episode(episode_): * np_rng.standard_normal(env.action_space.shape[0]) ) action = agent.act(latent_state) + action_noise - action = np.clip(action, -1.0, 1.0) # to account for the noise + action = np.clip( + action, -1.0, 1.0, dtype=env.action_space.dtype + ) # to account for the noise and fix dtype next_obs, reward, done, info = env.step(action) replay_buffer.add(obs, action, next_obs, reward, done) episode_reward += reward diff --git a/mbrl/algorithms/planet.py b/mbrl/algorithms/planet.py index eec065f0..af679076 100644 --- a/mbrl/algorithms/planet.py +++ b/mbrl/algorithms/planet.py @@ -156,6 +156,8 @@ def is_test_episode(episode_): planet.reset_posterior() action = None done = False + pbar = tqdm(total=500) + breakpoint() while not done: planet.update_posterior(obs, action=action, rng=rng) action_noise = ( @@ -165,7 +167,9 @@ def is_test_episode(episode_): * np_rng.standard_normal(env.action_space.shape[0]) ) action = agent.act(obs) + action_noise - action = np.clip(action, -1.0, 1.0) # to account for the noise + action = np.clip( + action, -1.0, 1.0, dtype=env.action_space.dtype + ) # to account for the noise and fix dtype next_obs, reward, done, info = env.step(action) replay_buffer.add(obs, action, next_obs, reward, done) episode_reward += reward @@ -173,6 +177,8 @@ def is_test_episode(episode_): if debug_mode: print(f"step: {step}, reward: {reward}.") step += 1 + pbar.update(1) + pbar.close() total_rewards += episode_reward logger.log_data( mbrl.constants.RESULTS_LOG_NAME, From f44d50093b9538115f036c919826362ddd40efc4 Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Tue, 12 Apr 2022 16:39:40 -0400 Subject: [PATCH 08/24] remove breakpoint --- mbrl/algorithms/dreamer.py | 3 +++ mbrl/algorithms/planet.py | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/mbrl/algorithms/dreamer.py 
b/mbrl/algorithms/dreamer.py index 448266cf..55c9e1f7 100644 --- a/mbrl/algorithms/dreamer.py +++ b/mbrl/algorithms/dreamer.py @@ -182,6 +182,7 @@ def is_test_episode(episode_): planet.reset_posterior() action = None done = False + pbar = tqdm(total=500) while not done: latent_state = planet.update_posterior(obs, action=action, rng=rng) action_noise = ( @@ -201,6 +202,8 @@ def is_test_episode(episode_): if debug_mode: print(f"step: {step}, reward: {reward}.") step += 1 + pbar.update(1) + pbar.close() total_rewards += episode_reward logger.log_data( mbrl.constants.RESULTS_LOG_NAME, diff --git a/mbrl/algorithms/planet.py b/mbrl/algorithms/planet.py index af679076..120f5e9d 100644 --- a/mbrl/algorithms/planet.py +++ b/mbrl/algorithms/planet.py @@ -157,7 +157,6 @@ def is_test_episode(episode_): action = None done = False pbar = tqdm(total=500) - breakpoint() while not done: planet.update_posterior(obs, action=action, rng=rng) action_noise = ( From 6b845fbdaf0899488b4aa06b661e42a6bbe0dae9 Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Thu, 14 Apr 2022 00:08:46 -0400 Subject: [PATCH 09/24] working on config --- mbrl/examples/conf/algorithm/dreamer.yaml | 14 ++++++++++++++ mbrl/examples/main.py | 3 +++ mbrl/planning/dreamer_agent.py | 19 +++++++++++-------- setup.py | 3 ++- 4 files changed, 30 insertions(+), 9 deletions(-) create mode 100644 mbrl/examples/conf/algorithm/dreamer.yaml diff --git a/mbrl/examples/conf/algorithm/dreamer.yaml b/mbrl/examples/conf/algorithm/dreamer.yaml new file mode 100644 index 00000000..2689e961 --- /dev/null +++ b/mbrl/examples/conf/algorithm/dreamer.yaml @@ -0,0 +1,14 @@ +# @package _group_ +name: "dreamer" + +agent: + _target_: mbrl.planning.DreamerAgent + planning_horizon: ${overrides.planning_horizon} + optimizer_cfg: ${action_optimizer} + verbose: ${debug_mode} + +num_initial_trajectories: 5 +action_noise_std: 0.3 +test_frequency: 25 +num_episodes: 1000 +dataset_size: 1000000 \ No newline at end of file diff --git a/mbrl/examples/main.py b/mbrl/examples/main.py index c2e1a57f..ec6cb17e 100644 --- a/mbrl/examples/main.py +++ b/mbrl/examples/main.py @@ -7,6 +7,7 @@ import omegaconf import torch +import mbrl.algorithms.dreamer as dreamer import mbrl.algorithms.mbpo as mbpo import mbrl.algorithms.pets as pets import mbrl.algorithms.planet as planet @@ -25,6 +26,8 @@ def run(cfg: omegaconf.DictConfig): return mbpo.train(env, test_env, term_fn, cfg) if cfg.algorithm.name == "planet": return planet.train(env, cfg) + if cfg.algorithm.name == "dreamer": + return dreamer.train(env, cfg) if __name__ == "__main__": diff --git a/mbrl/planning/dreamer_agent.py b/mbrl/planning/dreamer_agent.py index fe4b806c..ce8ffbf6 100644 --- a/mbrl/planning/dreamer_agent.py +++ b/mbrl/planning/dreamer_agent.py @@ -17,7 +17,7 @@ import mbrl.models from mbrl.util.replay_buffer import TransitionIterator -from .core import Agent, complete_agent_cfg +from .core import Agent class Policy(nn.Module): @@ -58,9 +58,10 @@ def forward(self, latent_state): class DreamerAgent(Agent): def __init__( self, - planet: mbrl.models.PlaNetModel, - model_env: mbrl.models.ModelEnv, device: torch.device, + latent_state_size: int, + belief_size: int, + action_size: int, hidden_size_fcs: int = 200, horizon: int = 15, policy_lr: float = 8e-5, @@ -68,12 +69,11 @@ def __init__( grad_clip_norm: float = 1000.0, rng: Optional[torch.Generator] = None, ) -> None: - self.planet: mbrl.models.PlaNetModel = planet - self.model_env = model_env + self.planet: mbrl.models.PlaNetModel = None self.device = device self.horizon = 
horizon - self.latent_size = planet.latent_state_size + planet.belief_size - self.action_size = planet.action_size + self.latent_size = latent_state_size + belief_size + self.action_size = action_size self.policy = Policy( self.latent_size, @@ -159,6 +159,9 @@ def create_dreamer_agent_for_model( (:class:`DreamerAgent`): the agent. """ - complete_agent_cfg(model_env, agent_cfg) + agent_cfg.latent_state_size = planet.latent_state_size + agent_cfg.belief_size = planet.belief_size + agent_cfg.action_size = planet.action_size agent = hydra.utils.instantiate(agent_cfg) + agent.planet = planet return agent diff --git a/setup.py b/setup.py index e260abe9..6b174373 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,8 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from pathlib import Path -from setuptools import setup, find_packages + +from setuptools import find_packages, setup def parse_requirements_file(path): From a6e6a2a89559ca97bcddcfcdad72a8322395b9de Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Mon, 30 May 2022 00:57:54 -0700 Subject: [PATCH 10/24] wip --- mbrl/examples/conf/algorithm/dreamer.yaml | 15 +- .../conf/overrides/dreamer_walker_walk.yaml | 30 +++ mbrl/planning/dreamer_agent.py | 247 ++++++++++++++---- 3 files changed, 244 insertions(+), 48 deletions(-) create mode 100644 mbrl/examples/conf/overrides/dreamer_walker_walk.yaml diff --git a/mbrl/examples/conf/algorithm/dreamer.yaml b/mbrl/examples/conf/algorithm/dreamer.yaml index 2689e961..c24d9e9a 100644 --- a/mbrl/examples/conf/algorithm/dreamer.yaml +++ b/mbrl/examples/conf/algorithm/dreamer.yaml @@ -3,9 +3,18 @@ name: "dreamer" agent: _target_: mbrl.planning.DreamerAgent - planning_horizon: ${overrides.planning_horizon} - optimizer_cfg: ${action_optimizer} - verbose: ${debug_mode} + action_lb: ??? + action_ub: ??? + horizon: 15 + policy_lr: 0.00008 + critic_lr: 0.00008 + gamma: 0.99 + lam: 0.95 + grad_clip_norm: 100.0 + min_std: 0.0001 + init_std: 5 + mean_scale: 5 + activation_function: "elu" num_initial_trajectories: 5 action_noise_std: 0.3 diff --git a/mbrl/examples/conf/overrides/dreamer_walker_walk.yaml b/mbrl/examples/conf/overrides/dreamer_walker_walk.yaml new file mode 100644 index 00000000..c698bacb --- /dev/null +++ b/mbrl/examples/conf/overrides/dreamer_walker_walk.yaml @@ -0,0 +1,30 @@ +# @package _group_ +env: "dmcontrol_walker_walk" # used to set the hydra dir, ignored otherwise + +env_cfg: + _target_: "mbrl.third_party.dmc2gym.wrappers.DMCWrapper" + domain_name: "walker" + task_name: "walk" + task_kwargs: + random: ${seed} + visualize_reward: false + from_pixels: true + height: 64 + width: 64 + frame_skip: 2 + bit_depth: 5 + +term_fn: "no_termination" + +# General configuration overrides +trial_length: 500 +action_noise_std: 0.3 + +# Model overrides +num_grad_updates: 1 # Why is this 100 for PlaNet? +sequence_length: 50 +batch_size: 50 +free_nats: 3 +kl_scale: 1.0 + +# Dreamer configuration overrides diff --git a/mbrl/planning/dreamer_agent.py b/mbrl/planning/dreamer_agent.py index ce8ffbf6..ed94a1be 100644 --- a/mbrl/planning/dreamer_agent.py +++ b/mbrl/planning/dreamer_agent.py @@ -3,24 +3,41 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import pathlib -from typing import Callable, Dict, List, Optional, Tuple, Union +from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union import hydra import numpy as np import omegaconf import torch from torch import nn -from torch.distributions import TanhTransform +from torch.distributions import ( + Independent, + Normal, + TanhTransform, + TransformedDistribution, +) from torch.nn import functional as F from torch.optim import Adam import mbrl.models +from mbrl.models.planet import PlaNetModel +from mbrl.types import TensorType from mbrl.util.replay_buffer import TransitionIterator -from .core import Agent +from .core import Agent, complete_agent_cfg -class Policy(nn.Module): +def freeze(module: nn.Module): + for p in module.parameters(): + p.requires_grad = False + + +def unfreeze(module: nn.Module): + for p in module.parameters(): + p.requires_grad = True + + +class PolicyModel(nn.Module): def __init__( self, latent_size: int, @@ -29,70 +46,103 @@ def __init__( min_std: float = 1e-4, init_std: float = 5, mean_scale: float = 5, + activation_function="elu", ): super().__init__() - self.model = nn.Sequential( - nn.Linear(latent_size, hidden_size), - nn.ELU(), - nn.Linear(hidden_size, hidden_size), - nn.ELU(), - nn.Linear(hidden_size, action_size * 2), - ) + self.act_fn = getattr(F, activation_function) + self.fc1 = nn.Linear(latent_size, hidden_size) + self.fc2 = nn.Linear(hidden_size, hidden_size) + self.fc3 = nn.Linear(hidden_size, action_size * 2) self.min_std = min_std self.init_std = init_std self.mean_scale = mean_scale self.raw_init_std = np.log(np.exp(self.init_std) - 1) - def forward(self, latent_state): - latent_state = torch.cat(latent_state.values(), dim=-1) - model_out = self.model(latent_state) + def forward(self, belief, state): + hidden = self.act_fn(self.fc1(torch.cat([belief, state], dim=1))) + hidden = self.act_fn(self.fc2(hidden)) + model_out = self.fc3(hidden).squeeze(dim=1) mean, std = torch.chunk(model_out, 2, -1) mean = self.mean_scale * torch.tanh(mean / self.mean_scale) std = F.softplus(std + self.raw_init_std) + self.min_std - dist = torch.distributions.Normal(mean, std) - dist = torch.distributions.TransformedDistribution(dist, TanhTransform()) - dist = torch.distributions.Independent(dist, 1) + dist = Normal(mean, std) + dist = TransformedDistribution(dist, TanhTransform()) + dist = Independent(dist, 1) return dist +class ValueModel(nn.Module): + def __init__(self, latent_size, hidden_size, activation_function="elu"): + super().__init__() + self.act_fn = getattr(F, activation_function) + self.fc1 = nn.Linear(latent_size, hidden_size) + self.fc2 = nn.Linear(hidden_size, hidden_size) + self.fc3 = nn.Linear(hidden_size, 1) + + def forward(self, belief, state): + hidden = self.act_fn(self.fc1(torch.cat([belief, state], dim=1))) + hidden = self.act_fn(self.fc2(hidden)) + value = self.fc3(hidden).squeeze(dim=1) + return value + + class DreamerAgent(Agent): def __init__( self, - device: torch.device, - latent_state_size: int, - belief_size: int, action_size: int, - hidden_size_fcs: int = 200, + action_lb: Sequence[float] = [-1.0], + action_ub: Sequence[float] = [1.0], + belief_size: int = 200, + latent_state_size: int = 30, + hidden_size: int = 300, horizon: int = 15, policy_lr: float = 8e-5, + min_std: float = 1e-4, + init_std: float = 5, + mean_scale: float = 5, critic_lr: float = 8e-5, - grad_clip_norm: float = 1000.0, - rng: Optional[torch.Generator] = None, - ) -> None: - self.planet: mbrl.models.PlaNetModel = None - self.device = device - 
self.horizon = horizon - self.latent_size = latent_state_size + belief_size + gamma: float = 0.99, + lam: float = 0.95, + grad_clip_norm: float = 100.0, + activation_function: str = "elu", + device: Union[str, torch.device] = "cpu", + ): + super().__init__() + self.belief_size = belief_size + self.latent_state_size = latent_state_size self.action_size = action_size + self.gamma = gamma + self.lam = lam + self.grad_clip_norm = grad_clip_norm + self.horizon = horizon + self.action_lb = action_lb + self.action_ub = action_ub + self.device = device + self.planet_model: PlaNetModel = None - self.policy = Policy( - self.latent_size, - self.action_size, - hidden_size_fcs, + self.policy = PolicyModel( + belief_size + latent_state_size, + action_size, + hidden_size, + min_std, + init_std, + mean_scale, + activation_function, ).to(device) self.policy_optim = Adam(self.policy.parameters(), policy_lr) - self.critic = nn.Sequential( - nn.Linear(self.latent_size, hidden_size_fcs), - nn.ELU(), - nn.Linear(hidden_size_fcs, hidden_size_fcs), - nn.ELU(), - nn.Linear(hidden_size_fcs, 1), + self.critic = ValueModel( + belief_size + latent_state_size, hidden_size, activation_function ).to(device) self.critic_optim = Adam(self.critic.parameters(), critic_lr) - def act(self, obs: Dict[str, mbrl.types.TensorType], **_kwargs) -> np.ndarray: - action_dist = self.policy(obs) - action = action_dist.sample() + def act( + self, obs: Dict[str, TensorType], training: bool = True, **_kwargs + ) -> TensorType: + action_dist = self.policy(obs["belief"], obs["latent"]) + if training: + action = action_dist.rsample() + else: + action = action_dist.mode() return action.cpu().detach().numpy() def train( @@ -107,7 +157,84 @@ def train( evaluate: bool = True, silent: bool = False, ) -> Tuple[List[float], List[float]]: + raise NotImplementedError + """ + eval_dataset = dataset_train if dataset_val is None else dataset_val + + training_losses, val_scores = [], [] + best_weights: Optional[Dict] = None + epoch_iter = range(num_epochs) if num_epochs else itertools.count() + epochs_since_update = 0 + best_val_score = self.evaluate(eval_dataset) if evaluate else None + # only enable tqdm if training for a single epoch, + # otherwise it produces too much output + disable_tqdm = silent or (num_epochs is None or num_epochs > 1) + + for batch in dataset_train: + B, L, _ = beliefs.shape + beliefs = torch.reshape(beliefs, [B * L, -1]) + states = torch.reshape(states, [B * L, -1]) + imag_beliefs = [] + imag_states = [] + imag_actions = [] + imag_rewards = [] + for _ in range(self.horizon): + state = {"belief": beliefs, "latent": states} + actions = self.act(beliefs, states) + imag_beliefs.append(beliefs) + imag_states.append(states) + imag_actions.append(actions) + + states, rewards = self.planet_model.sample(actions, states) + imag_rewards.append(rewards) + + # I x (B*L) x _ + imag_beliefs = torch.stack(imag_beliefs).to(self.device) + imag_states = torch.stack(imag_states).to(self.device) + imag_actions = torch.stack(imag_actions).to(self.device) + freeze(self.critic) + imag_values = self.critic({"belief": imag_beliefs, "latent": imag_states}) + unfreeze(self.critic) + + discount_arr = self.gamma * torch.ones_like(imag_rewards) + returns = self._compute_return( + imag_rewards[:-1], + imag_values[:-1], + discount_arr[:-1], + bootstrap=imag_values[-1], + lambda_=self.lam, + ) + # Make the top row 1 so the cumulative product starts with discount^0 + discount_arr = torch.cat( + [torch.ones_like(discount_arr[:1]), discount_arr[1:]] + ) + discount 
= torch.cumprod(discount_arr[:-1], 0) + policy_loss = -torch.mean(discount * returns) + + # Detach tensors which have gradients through policy model for value loss + value_beliefs = imag_beliefs.detach()[:-1] + value_states = imag_states.detach()[:-1] + value_discount = discount.detach() + value_target = returns.detach() + state = {"belief": value_beliefs, "latent": value_states} + value_pred = self.critic(state) + value_loss = F.mse_loss(value_discount * value_target, value_pred) + + self.policy_optim.zero_grad() + self.critic_optim.zero_grad() + + nn.utils.clip_grad_norm_( + self.policy_model.parameters(), self.grad_clip_norm + ) + nn.utils.clip_grad_norm_(self.value_model.parameters(), self.grad_clip_norm) + + policy_loss.backward() + value_loss.backward() + + self.policy_optim.step() + self.critic_optim.step() + """ def save(self, save_dir: Union[str, pathlib.Path]): """Saves the agent to the given directory.""" @@ -140,6 +267,33 @@ def load(self, load_dir: Union[str, pathlib.Path], evaluate=False): self.policy.train() self.critic.train() + def _compute_return( + self, + reward: torch.Tensor, + value: torch.Tensor, + discount: torch.Tensor, + bootstrap: torch.Tensor, + lambda_: float, + ): + """ + Compute the discounted reward for a batch of data. + reward, value, and discount are all shape [horizon - 1, batch, 1] + (last element is cut off) + Bootstrap is [batch, 1] + """ + next_values = torch.cat([value[1:], bootstrap[None]], 0) + target = reward + discount * next_values * (1 - lambda_) + timesteps = list(range(reward.shape[0] - 1, -1, -1)) + outputs = [] + accumulated_reward = bootstrap + for t in timesteps: + inp = target[t] + discount_factor = discount[t] + accumulated_reward = inp + discount_factor * lambda_ * accumulated_reward + outputs.append(accumulated_reward) + returns = torch.flip(torch.stack(outputs), [0]) + return returns + def create_dreamer_agent_for_model( planet: mbrl.models.PlaNetModel, @@ -159,9 +313,12 @@ def create_dreamer_agent_for_model( (:class:`DreamerAgent`): the agent. 
""" - agent_cfg.latent_state_size = planet.latent_state_size - agent_cfg.belief_size = planet.belief_size - agent_cfg.action_size = planet.action_size + complete_agent_cfg(model_env, agent_cfg) + with omegaconf.open_dict(agent_cfg): + agent_cfg.latent_state_size = planet.latent_state_size + agent_cfg.belief_size = planet.belief_size + agent_cfg.action_size = planet.action_size agent = hydra.utils.instantiate(agent_cfg) - agent.planet = planet + # Not a primitive, so assigned after initialization + agent.planet_model = planet return agent From 4852125e5f7339b11d334ece3803ab93335fdcaf Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Sun, 5 Jun 2022 23:57:19 -0700 Subject: [PATCH 11/24] Finish dreamer loss --- mbrl/algorithms/dreamer.py | 8 +- mbrl/examples/conf/algorithm/dreamer.yaml | 1 + mbrl/planning/dreamer_agent.py | 183 ++++++++++++---------- 3 files changed, 105 insertions(+), 87 deletions(-) diff --git a/mbrl/algorithms/dreamer.py b/mbrl/algorithms/dreamer.py index 55c9e1f7..7134ca95 100644 --- a/mbrl/algorithms/dreamer.py +++ b/mbrl/algorithms/dreamer.py @@ -166,9 +166,7 @@ def is_test_episode(episode_): trainer.train( dataset, num_epochs=1, batch_callback=model_batch_callback, evaluate=False ) - agent.train( - dataset, num_epochs=1, batch_callback=agent_batch_callback, evaluate=False - ) + agent.train(dataset, num_epochs=1, batch_callback=agent_batch_callback) planet.save(work_dir) agent.save(work_dir) replay_buffer.save(work_dir) @@ -191,7 +189,9 @@ def is_test_episode(episode_): else cfg.overrides.action_noise_std * np_rng.standard_normal(env.action_space.shape[0]) ) - action = agent.act(latent_state) + action_noise + action = agent.act(latent_state) + action = action.detach().cpu().squeeze(0).numpy() + action = action + action_noise action = np.clip( action, -1.0, 1.0, dtype=env.action_space.dtype ) # to account for the noise and fix dtype diff --git a/mbrl/examples/conf/algorithm/dreamer.yaml b/mbrl/examples/conf/algorithm/dreamer.yaml index c24d9e9a..b9116bf7 100644 --- a/mbrl/examples/conf/algorithm/dreamer.yaml +++ b/mbrl/examples/conf/algorithm/dreamer.yaml @@ -15,6 +15,7 @@ agent: init_std: 5 mean_scale: 5 activation_function: "elu" + device: ${device} num_initial_trajectories: 5 action_noise_std: 0.3 diff --git a/mbrl/planning/dreamer_agent.py b/mbrl/planning/dreamer_agent.py index ed94a1be..b9c16483 100644 --- a/mbrl/planning/dreamer_agent.py +++ b/mbrl/planning/dreamer_agent.py @@ -3,12 +3,13 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import pathlib -from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union +from typing import Callable, Dict, Optional, Sequence, Union import hydra import numpy as np import omegaconf import torch +import tqdm from torch import nn from torch.distributions import ( Independent, @@ -58,8 +59,8 @@ def __init__( self.mean_scale = mean_scale self.raw_init_std = np.log(np.exp(self.init_std) - 1) - def forward(self, belief, state): - hidden = self.act_fn(self.fc1(torch.cat([belief, state], dim=1))) + def forward(self, belief, latent): + hidden = self.act_fn(self.fc1(torch.cat([belief, latent], dim=-1))) hidden = self.act_fn(self.fc2(hidden)) model_out = self.fc3(hidden).squeeze(dim=1) mean, std = torch.chunk(model_out, 2, -1) @@ -79,8 +80,8 @@ def __init__(self, latent_size, hidden_size, activation_function="elu"): self.fc2 = nn.Linear(hidden_size, hidden_size) self.fc3 = nn.Linear(hidden_size, 1) - def forward(self, belief, state): - hidden = self.act_fn(self.fc1(torch.cat([belief, state], dim=1))) + def forward(self, belief, latent): + hidden = self.act_fn(self.fc1(torch.cat([belief, latent], dim=-1))) hidden = self.act_fn(self.fc2(hidden)) value = self.fc3(hidden).squeeze(dim=1) return value @@ -135,6 +136,9 @@ def __init__( ).to(device) self.critic_optim = Adam(self.critic.parameters(), critic_lr) + def parameters(self): + return list(self.policy.parameters()) + list(self.critic.parameters()) + def act( self, obs: Dict[str, TensorType], training: bool = True, **_kwargs ) -> TensorType: @@ -143,98 +147,111 @@ def act( action = action_dist.rsample() else: action = action_dist.mode() - return action.cpu().detach().numpy() + return action def train( self, dataset_train: TransitionIterator, - dataset_val: Optional[TransitionIterator] = None, num_epochs: Optional[int] = None, - patience: Optional[int] = None, - improvement_threshold: float = 0.01, - callback: Optional[Callable] = None, batch_callback: Optional[Callable] = None, - evaluate: bool = True, silent: bool = False, - ) -> Tuple[List[float], List[float]]: - - raise NotImplementedError - """ - eval_dataset = dataset_train if dataset_val is None else dataset_val + ) -> None: - training_losses, val_scores = [], [] - best_weights: Optional[Dict] = None - epoch_iter = range(num_epochs) if num_epochs else itertools.count() - epochs_since_update = 0 - best_val_score = self.evaluate(eval_dataset) if evaluate else None # only enable tqdm if training for a single epoch, # otherwise it produces too much output disable_tqdm = silent or (num_epochs is None or num_epochs > 1) - for batch in dataset_train: - B, L, _ = beliefs.shape - beliefs = torch.reshape(beliefs, [B * L, -1]) - states = torch.reshape(states, [B * L, -1]) - imag_beliefs = [] - imag_states = [] - imag_actions = [] - imag_rewards = [] - for _ in range(self.horizon): - state = {"belief": beliefs, "latent": states} - actions = self.act(beliefs, states) - imag_beliefs.append(beliefs) - imag_states.append(states) - imag_actions.append(actions) - - states, rewards = self.planet_model.sample(actions, states) - imag_rewards.append(rewards) - - # I x (B*L) x _ - imag_beliefs = torch.stack(imag_beliefs).to(self.device) - imag_states = torch.stack(imag_states).to(self.device) - imag_actions = torch.stack(imag_actions).to(self.device) - freeze(self.critic) - imag_values = self.critic({"belief": imag_beliefs, "latent": imag_states}) - unfreeze(self.critic) - - discount_arr = self.gamma * torch.ones_like(imag_rewards) - returns = self._compute_return( - imag_rewards[:-1], - 
imag_values[:-1], - discount_arr[:-1], - bootstrap=imag_values[-1], - lambda_=self.lam, - ) - # Make the top row 1 so the cumulative product starts with discount^0 - discount_arr = torch.cat( - [torch.ones_like(discount_arr[:1]), discount_arr[1:]] - ) - discount = torch.cumprod(discount_arr[:-1], 0) - policy_loss = -torch.mean(discount * returns) - - # Detach tensors which have gradients through policy model for value loss - value_beliefs = imag_beliefs.detach()[:-1] - value_states = imag_states.detach()[:-1] - value_discount = discount.detach() - value_target = returns.detach() - state = {"belief": value_beliefs, "latent": value_states} - value_pred = self.critic(state) - value_loss = F.mse_loss(value_discount * value_target, value_pred) - - self.policy_optim.zero_grad() - self.critic_optim.zero_grad() - - nn.utils.clip_grad_norm_( - self.policy_model.parameters(), self.grad_clip_norm - ) - nn.utils.clip_grad_norm_(self.value_model.parameters(), self.grad_clip_norm) + meta = {} - policy_loss.backward() - value_loss.backward() + for batch in tqdm.tqdm(dataset_train, disable=disable_tqdm): + obs, actions, rewards = self.planet_model._process_batch( + batch, + pixel_obs=True, + ) - self.policy_optim.step() - self.critic_optim.step() - """ + ( + _, + _, + _, + latents, + beliefs, + _, + rewards, + ) = self.planet_model(obs[:, 1:], actions[:, :-1], rewards[:, :-1]) + + for epoch in range(num_epochs): + B, L, _ = beliefs.shape + beliefs = torch.reshape(beliefs, [B * L, -1]) + latents = torch.reshape(latents, [B * L, -1]) + states = {"belief": beliefs, "latent": latents} + imag_beliefs = [] + imag_latents = [] + imag_actions = [] + imag_rewards = [] + for _ in range(self.horizon): + actions = self.act(states) + imag_beliefs.append(states["belief"]) + imag_latents.append(states["latent"]) + imag_actions.append(actions) + + _, rewards, _, states = self.planet_model.sample(actions, states) + imag_rewards.append(rewards) + + # I x (B*L) x _ + imag_beliefs = torch.stack(imag_beliefs).to(self.device) + imag_latents = torch.stack(imag_latents).to(self.device) + imag_actions = torch.stack(imag_actions).to(self.device) + freeze(self.critic) + imag_values = self.critic(imag_beliefs, imag_latents) + unfreeze(self.critic) + + imag_rewards = torch.stack(imag_rewards).to(self.device) + discount_arr = self.gamma * torch.ones_like(imag_rewards) + returns = self._compute_return( + imag_rewards[:-1], + imag_values[:-1], + discount_arr[:-1], + bootstrap=imag_values[-1], + lambda_=self.lam, + ) + # Make the top row 1 so the cumulative product starts with discount^0 + discount_arr = torch.cat( + [torch.ones_like(discount_arr[:1]), discount_arr[1:]] + ) + discount = torch.cumprod(discount_arr[:-1], 0) + policy_loss = -torch.mean(discount * returns) + + # Detach tensors which have gradients through policy model for value loss + value_beliefs = imag_beliefs.detach()[:-1] # type: ignore + value_latents = imag_latents.detach()[:-1] # type: ignore + value_discount = discount.detach() + value_target = returns.detach() + value_pred = self.critic(value_beliefs, value_latents) + critic_loss = F.mse_loss(value_discount * value_target, value_pred) + + self.policy_optim.zero_grad() + self.critic_optim.zero_grad() + + nn.utils.clip_grad_norm_(self.policy.parameters(), self.grad_clip_norm) + nn.utils.clip_grad_norm_(self.critic.parameters(), self.grad_clip_norm) + + policy_loss.backward() + critic_loss.backward() + + meta["policy_loss"] = policy_loss.item() + meta["critic_loss"] = critic_loss.item() + + with torch.no_grad(): + 
grad_norm = 0.0 + for p in list( + filter(lambda p: p.grad is not None, self.parameters()) + ): + grad_norm += p.grad.data.norm(2).item() + meta["grad_norm"] = grad_norm + + self.policy_optim.step() + self.critic_optim.step() + batch_callback(epoch, None, meta, "train") def save(self, save_dir: Union[str, pathlib.Path]): """Saves the agent to the given directory.""" From d54a0e17b05245c0a8c133c0f76043b65a31a904 Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Sun, 5 Jun 2022 23:59:52 -0700 Subject: [PATCH 12/24] Add Dreamer to README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 967f1e22..ae8f2a72 100644 --- a/README.md +++ b/README.md @@ -51,8 +51,8 @@ as examples of how to use this library. You can find them in the [mbrl/algorithms](https://github.com/facebookresearch/mbrl-lib/tree/main/mbrl/algorithms) folder. Currently, we have implemented [PETS](https://github.com/facebookresearch/mbrl-lib/tree/main/mbrl/algorithms/pets.py), [MBPO](https://github.com/facebookresearch/mbrl-lib/tree/main/mbrl/algorithms/mbpo.py), -[PlaNet](https://github.com/facebookresearch/mbrl-lib/tree/main/mbrl/algorithms/planet.py), -we plan to keep increasing this list in the future. +[PlaNet](https://github.com/facebookresearch/mbrl-lib/tree/main/mbrl/algorithms/planet.py), +[Dreamer](https://github.com/facebookresearch/mbrl-lib/tree/main/mbrl/algorithms/dreamer.py), we plan to keep increasing this list in the future. The implementations rely on [Hydra](https://github.com/facebookresearch/hydra) to handle configuration. You can see the configuration files in From 64441ca64a26136f9f8ddb27209991ac260fb81d Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Tue, 7 Jun 2022 09:34:03 -0700 Subject: [PATCH 13/24] Added config yamls --- .../overrides/dreamer_cartpole_balance.yaml | 30 +++++++++++++++++++ .../conf/overrides/dreamer_cheetah_run.yaml | 30 +++++++++++++++++++ .../conf/overrides/dreamer_walker_run.yaml | 30 +++++++++++++++++++ .../conf/overrides/dreamer_walker_stand.yaml | 30 +++++++++++++++++++ 4 files changed, 120 insertions(+) create mode 100644 mbrl/examples/conf/overrides/dreamer_cartpole_balance.yaml create mode 100644 mbrl/examples/conf/overrides/dreamer_cheetah_run.yaml create mode 100644 mbrl/examples/conf/overrides/dreamer_walker_run.yaml create mode 100644 mbrl/examples/conf/overrides/dreamer_walker_stand.yaml diff --git a/mbrl/examples/conf/overrides/dreamer_cartpole_balance.yaml b/mbrl/examples/conf/overrides/dreamer_cartpole_balance.yaml new file mode 100644 index 00000000..fb4f68e9 --- /dev/null +++ b/mbrl/examples/conf/overrides/dreamer_cartpole_balance.yaml @@ -0,0 +1,30 @@ +# @package _group_ +env: "dmcontrol_cartpole_balance" # used to set the hydra dir, ignored otherwise + +env_cfg: + _target_: "mbrl.third_party.dmc2gym.wrappers.DMCWrapper" + domain_name: "cartpole" + task_name: "balance" + task_kwargs: + random: ${seed} + visualize_reward: false + from_pixels: true + height: 64 + width: 64 + frame_skip: 2 + bit_depth: 5 + +term_fn: "no_termination" + +# General configuration overrides +trial_length: 500 +action_noise_std: 0.3 + +# Model overrides +num_grad_updates: 1 # Why is this 100 for PlaNet? 
+sequence_length: 50 +batch_size: 50 +free_nats: 3 +kl_scale: 1.0 + +# Dreamer configuration overrides diff --git a/mbrl/examples/conf/overrides/dreamer_cheetah_run.yaml b/mbrl/examples/conf/overrides/dreamer_cheetah_run.yaml new file mode 100644 index 00000000..f8ef8b9b --- /dev/null +++ b/mbrl/examples/conf/overrides/dreamer_cheetah_run.yaml @@ -0,0 +1,30 @@ +# @package _group_ +env: "dmcontrol_cheetah_run" # used to set the hydra dir, ignored otherwise + +env_cfg: + _target_: "mbrl.third_party.dmc2gym.wrappers.DMCWrapper" + domain_name: "cheetah" + task_name: "run" + task_kwargs: + random: ${seed} + visualize_reward: false + from_pixels: true + height: 64 + width: 64 + frame_skip: 2 + bit_depth: 5 + +term_fn: "no_termination" + +# General configuration overrides +trial_length: 500 +action_noise_std: 0.3 + +# Model overrides +num_grad_updates: 1 # Why is this 100 for PlaNet? +sequence_length: 50 +batch_size: 50 +free_nats: 3 +kl_scale: 1.0 + +# Dreamer configuration overrides diff --git a/mbrl/examples/conf/overrides/dreamer_walker_run.yaml b/mbrl/examples/conf/overrides/dreamer_walker_run.yaml new file mode 100644 index 00000000..5d13f314 --- /dev/null +++ b/mbrl/examples/conf/overrides/dreamer_walker_run.yaml @@ -0,0 +1,30 @@ +# @package _group_ +env: "dmcontrol_walker_run" # used to set the hydra dir, ignored otherwise + +env_cfg: + _target_: "mbrl.third_party.dmc2gym.wrappers.DMCWrapper" + domain_name: "walker" + task_name: "run" + task_kwargs: + random: ${seed} + visualize_reward: false + from_pixels: true + height: 64 + width: 64 + frame_skip: 2 + bit_depth: 5 + +term_fn: "no_termination" + +# General configuration overrides +trial_length: 500 +action_noise_std: 0.3 + +# Model overrides +num_grad_updates: 1 # Why is this 100 for PlaNet? +sequence_length: 50 +batch_size: 50 +free_nats: 3 +kl_scale: 1.0 + +# Dreamer configuration overrides diff --git a/mbrl/examples/conf/overrides/dreamer_walker_stand.yaml b/mbrl/examples/conf/overrides/dreamer_walker_stand.yaml new file mode 100644 index 00000000..3fda4db9 --- /dev/null +++ b/mbrl/examples/conf/overrides/dreamer_walker_stand.yaml @@ -0,0 +1,30 @@ +# @package _group_ +env: "dmcontrol_walker_stand" # used to set the hydra dir, ignored otherwise + +env_cfg: + _target_: "mbrl.third_party.dmc2gym.wrappers.DMCWrapper" + domain_name: "walker" + task_name: "stand" + task_kwargs: + random: ${seed} + visualize_reward: false + from_pixels: true + height: 64 + width: 64 + frame_skip: 2 + bit_depth: 5 + +term_fn: "no_termination" + +# General configuration overrides +trial_length: 500 +action_noise_std: 0.3 + +# Model overrides +num_grad_updates: 1 # Why is this 100 for PlaNet? 
+sequence_length: 50 +batch_size: 50 +free_nats: 3 +kl_scale: 1.0 + +# Dreamer configuration overrides From 3ffcf2d2ee24aacb6466bcee668f1f76779db985 Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Sat, 11 Jun 2022 12:29:03 -0700 Subject: [PATCH 14/24] rename pyproject --- .pre-commit-config.yaml | 1 - pyproyect.toml => pyproject.toml | 0 2 files changed, 1 deletion(-) rename pyproyect.toml => pyproject.toml (100%) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1a57593b..7305d0bc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,6 @@ repos: hooks: - id: black files: 'mbrl' - language_version: python3.7 - repo: https://gitlab.com/pycqa/flake8 rev: 3.9.2 diff --git a/pyproyect.toml b/pyproject.toml similarity index 100% rename from pyproyect.toml rename to pyproject.toml From 656a6cee774feac1b98e29a832b5b5fc2d255a10 Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Sat, 11 Jun 2022 22:49:38 -0700 Subject: [PATCH 15/24] Make saving replay buffer optional --- mbrl/algorithms/dreamer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mbrl/algorithms/dreamer.py b/mbrl/algorithms/dreamer.py index 7134ca95..bf6a21d8 100644 --- a/mbrl/algorithms/dreamer.py +++ b/mbrl/algorithms/dreamer.py @@ -169,7 +169,8 @@ def is_test_episode(episode_): agent.train(dataset, num_epochs=1, batch_callback=agent_batch_callback) planet.save(work_dir) agent.save(work_dir) - replay_buffer.save(work_dir) + if cfg.overrides.get("save_replay_buffer", False): + replay_buffer.save(work_dir) metrics = get_metrics_and_clear_metric_containers() logger.log_data("metrics", metrics) From 03d383d84a7415ae2ff712e12b0b63f6f87f11c6 Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Sun, 12 Jun 2022 01:32:00 -0700 Subject: [PATCH 16/24] drop deprecation test --- mbrl/algorithms/planet.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mbrl/algorithms/planet.py b/mbrl/algorithms/planet.py index 120f5e9d..4729d6fd 100644 --- a/mbrl/algorithms/planet.py +++ b/mbrl/algorithms/planet.py @@ -145,7 +145,8 @@ def is_test_episode(episode_): dataset, num_epochs=1, batch_callback=batch_callback, evaluate=False ) planet.save(work_dir) - replay_buffer.save(work_dir) + if cfg.overrides.get("save_replay_buffer", False): + replay_buffer.save(work_dir) metrics = get_metrics_and_clear_metric_containers() logger.log_data("metrics", metrics) From da7f83cad2a04642104e0a91cc52d6227ff6f897 Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Sun, 12 Jun 2022 01:48:53 -0700 Subject: [PATCH 17/24] Fix num_grad_updates --- mbrl/examples/conf/overrides/dreamer_cartpole_balance.yaml | 2 +- mbrl/examples/conf/overrides/dreamer_cheetah_run.yaml | 2 +- mbrl/examples/conf/overrides/dreamer_walker_run.yaml | 2 +- mbrl/examples/conf/overrides/dreamer_walker_stand.yaml | 2 +- mbrl/examples/conf/overrides/dreamer_walker_walk.yaml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mbrl/examples/conf/overrides/dreamer_cartpole_balance.yaml b/mbrl/examples/conf/overrides/dreamer_cartpole_balance.yaml index fb4f68e9..d7a0c54a 100644 --- a/mbrl/examples/conf/overrides/dreamer_cartpole_balance.yaml +++ b/mbrl/examples/conf/overrides/dreamer_cartpole_balance.yaml @@ -21,7 +21,7 @@ trial_length: 500 action_noise_std: 0.3 # Model overrides -num_grad_updates: 1 # Why is this 100 for PlaNet? 
+num_grad_updates: 100 sequence_length: 50 batch_size: 50 free_nats: 3 diff --git a/mbrl/examples/conf/overrides/dreamer_cheetah_run.yaml b/mbrl/examples/conf/overrides/dreamer_cheetah_run.yaml index f8ef8b9b..afb7ca5a 100644 --- a/mbrl/examples/conf/overrides/dreamer_cheetah_run.yaml +++ b/mbrl/examples/conf/overrides/dreamer_cheetah_run.yaml @@ -21,7 +21,7 @@ trial_length: 500 action_noise_std: 0.3 # Model overrides -num_grad_updates: 1 # Why is this 100 for PlaNet? +num_grad_updates: 100 sequence_length: 50 batch_size: 50 free_nats: 3 diff --git a/mbrl/examples/conf/overrides/dreamer_walker_run.yaml b/mbrl/examples/conf/overrides/dreamer_walker_run.yaml index 5d13f314..24708f28 100644 --- a/mbrl/examples/conf/overrides/dreamer_walker_run.yaml +++ b/mbrl/examples/conf/overrides/dreamer_walker_run.yaml @@ -21,7 +21,7 @@ trial_length: 500 action_noise_std: 0.3 # Model overrides -num_grad_updates: 1 # Why is this 100 for PlaNet? +num_grad_updates: 100 sequence_length: 50 batch_size: 50 free_nats: 3 diff --git a/mbrl/examples/conf/overrides/dreamer_walker_stand.yaml b/mbrl/examples/conf/overrides/dreamer_walker_stand.yaml index 3fda4db9..3022e68b 100644 --- a/mbrl/examples/conf/overrides/dreamer_walker_stand.yaml +++ b/mbrl/examples/conf/overrides/dreamer_walker_stand.yaml @@ -21,7 +21,7 @@ trial_length: 500 action_noise_std: 0.3 # Model overrides -num_grad_updates: 1 # Why is this 100 for PlaNet? +num_grad_updates: 100 sequence_length: 50 batch_size: 50 free_nats: 3 diff --git a/mbrl/examples/conf/overrides/dreamer_walker_walk.yaml b/mbrl/examples/conf/overrides/dreamer_walker_walk.yaml index c698bacb..501f27d0 100644 --- a/mbrl/examples/conf/overrides/dreamer_walker_walk.yaml +++ b/mbrl/examples/conf/overrides/dreamer_walker_walk.yaml @@ -21,7 +21,7 @@ trial_length: 500 action_noise_std: 0.3 # Model overrides -num_grad_updates: 1 # Why is this 100 for PlaNet? 
+num_grad_updates: 100 sequence_length: 50 batch_size: 50 free_nats: 3 From 3b4a00c7cc55aad8c7ddc7ea7c6a1ce6c59c8bf5 Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Sun, 12 Jun 2022 02:21:40 -0700 Subject: [PATCH 18/24] Add policy and critic loss to metrics --- mbrl/algorithms/dreamer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mbrl/algorithms/dreamer.py b/mbrl/algorithms/dreamer.py index bf6a21d8..45f96bfc 100644 --- a/mbrl/algorithms/dreamer.py +++ b/mbrl/algorithms/dreamer.py @@ -29,6 +29,8 @@ ("reward_loss", "RL", "float"), ("gradient_norm", "GN", "float"), ("kl_loss", "KL", "float"), + ("policy_loss", "PL", "float"), + ("critic_loss", "CL", "float"), ] From 68d55f547e376f02bdf3f1a73eaa4ae5a456d429 Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Wed, 15 Jun 2022 11:22:52 -0700 Subject: [PATCH 19/24] Freeze planet during dreamer train --- mbrl/planning/dreamer_agent.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mbrl/planning/dreamer_agent.py b/mbrl/planning/dreamer_agent.py index b9c16483..e97083e0 100644 --- a/mbrl/planning/dreamer_agent.py +++ b/mbrl/planning/dreamer_agent.py @@ -163,6 +163,8 @@ def train( meta = {} + freeze(self.planet_model) + for batch in tqdm.tqdm(dataset_train, disable=disable_tqdm): obs, actions, rewards = self.planet_model._process_batch( batch, @@ -252,6 +254,7 @@ def train( self.policy_optim.step() self.critic_optim.step() batch_callback(epoch, None, meta, "train") + unfreeze(self.planet_model) def save(self, save_dir: Union[str, pathlib.Path]): """Saves the agent to the given directory.""" From 3fb69be7824f5266a70d62fb907465f8c9cb3dd2 Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Tue, 2 Aug 2022 23:27:29 -0700 Subject: [PATCH 20/24] wip Signed-off-by: Rohan138 --- .pre-commit-config.yaml | 2 +- requirements/dev.txt | 2 +- requirements/main.txt | 2 +- setup.cfg | 1 - setup.py | 3 +++ 5 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7305d0bc..a59f15ab 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/psf/black - rev: 22.1.0 + rev: 22.3.0 hooks: - id: black files: 'mbrl' diff --git a/requirements/dev.txt b/requirements/dev.txt index 5784d729..e7773d0b 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -3,7 +3,7 @@ nbsphinx>=0.8.0 sphinx-rtd-theme>=0.5.0 flake8>=3.8.4 mypy>=0.902 -black>=21.4b2 +black>=22.3.0 pytest>=6.0.1 types-pyyaml>=0.1.6 types-termcolor>=0.1.0 diff --git a/requirements/main.txt b/requirements/main.txt index cdae3633..a5509bc3 100644 --- a/requirements/main.txt +++ b/requirements/main.txt @@ -5,7 +5,7 @@ tensorboard>=2.4.0 imageio>=2.9.0 numpy>=1.19.1 matplotlib>=3.3.1 -gym==0.17.2 +gym>=0.20.0,<0.25.0 jupyter>=1.0.0 pytest>=6.0.1 sk-video>=1.1.10 diff --git a/setup.cfg b/setup.cfg index 0704346f..dbf4d08c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -11,7 +11,6 @@ exclude = mbrl/third_party/* [mypy] -python_version = 3.7 ignore_missing_imports = True show_error_codes = True strict_optional = False diff --git a/setup.py b/setup.py index e260abe9..a3c0ce00 100644 --- a/setup.py +++ b/setup.py @@ -33,6 +33,9 @@ def parse_requirements_file(path): "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", "Intended Audience :: Science/Research", "Topic :: Scientific/Engineering :: Artificial 
Intelligence", ], From 3215e6eade397a2f40208345bb34bb4b3eb99c2f Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Tue, 2 Aug 2022 23:31:30 -0700 Subject: [PATCH 21/24] wip Signed-off-by: Rohan138 --- setup.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/setup.py b/setup.py index a3c0ce00..2edae67a 100644 --- a/setup.py +++ b/setup.py @@ -32,10 +32,6 @@ def parse_requirements_file(path): classifiers=[ "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", "Intended Audience :: Science/Research", "Topic :: Scientific/Engineering :: Artificial Intelligence", ], From 65cfa05308b49ce30d2116fc62b97f515b71d564 Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Tue, 2 Aug 2022 23:31:44 -0700 Subject: [PATCH 22/24] wip Signed-off-by: Rohan138 --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 2edae67a..e260abe9 100644 --- a/setup.py +++ b/setup.py @@ -32,6 +32,7 @@ def parse_requirements_file(path): classifiers=[ "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", "Intended Audience :: Science/Research", "Topic :: Scientific/Engineering :: Artificial Intelligence", ], From b5beaa49068168e825c205845e715843fd586fbe Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Wed, 3 Aug 2022 00:10:52 -0700 Subject: [PATCH 23/24] wip Signed-off-by: Rohan138 --- mbrl/planning/dreamer_agent.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/mbrl/planning/dreamer_agent.py b/mbrl/planning/dreamer_agent.py index e97083e0..e25bd836 100644 --- a/mbrl/planning/dreamer_agent.py +++ b/mbrl/planning/dreamer_agent.py @@ -183,13 +183,14 @@ def train( for epoch in range(num_epochs): B, L, _ = beliefs.shape - beliefs = torch.reshape(beliefs, [B * L, -1]) - latents = torch.reshape(latents, [B * L, -1]) - states = {"belief": beliefs, "latent": latents} imag_beliefs = [] imag_latents = [] imag_actions = [] imag_rewards = [] + states = { + "belief": beliefs.reshape(B * L, -1), + "latent": latents.reshape(B * L, -1), + } for _ in range(self.horizon): actions = self.act(states) imag_beliefs.append(states["belief"]) @@ -200,14 +201,13 @@ def train( imag_rewards.append(rewards) # I x (B*L) x _ - imag_beliefs = torch.stack(imag_beliefs).to(self.device) - imag_latents = torch.stack(imag_latents).to(self.device) - imag_actions = torch.stack(imag_actions).to(self.device) - freeze(self.critic) - imag_values = self.critic(imag_beliefs, imag_latents) - unfreeze(self.critic) - - imag_rewards = torch.stack(imag_rewards).to(self.device) + imag_beliefs = torch.stack(imag_beliefs) + imag_latents = torch.stack(imag_latents) + imag_actions = torch.stack(imag_actions) + with torch.no_grad(): + imag_values = self.critic(imag_beliefs, imag_latents) + + imag_rewards = torch.stack(imag_rewards) discount_arr = self.gamma * torch.ones_like(imag_rewards) returns = self._compute_return( imag_rewards[:-1], @@ -303,10 +303,9 @@ def _compute_return( """ next_values = torch.cat([value[1:], bootstrap[None]], 0) target = reward + discount * next_values * (1 - lambda_) - timesteps = list(range(reward.shape[0] - 1, -1, -1)) outputs = [] accumulated_reward = bootstrap - for t in timesteps: + for t in range(reward.shape[0] - 1, -1, -1): inp = target[t] discount_factor = discount[t] accumulated_reward = inp + discount_factor 
* lambda_ * accumulated_reward From 71b54eb6097e6c2bf8b3d023ea3c98b8b49a3d31 Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Wed, 3 Aug 2022 00:15:44 -0700 Subject: [PATCH 24/24] wip Signed-off-by: Rohan138 --- mbrl/examples/conf/algorithm/dreamer.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mbrl/examples/conf/algorithm/dreamer.yaml b/mbrl/examples/conf/algorithm/dreamer.yaml index b9116bf7..fa42a8c0 100644 --- a/mbrl/examples/conf/algorithm/dreamer.yaml +++ b/mbrl/examples/conf/algorithm/dreamer.yaml @@ -21,4 +21,4 @@ num_initial_trajectories: 5 action_noise_std: 0.3 test_frequency: 25 num_episodes: 1000 -dataset_size: 1000000 \ No newline at end of file +dataset_size: 1000000
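
Note on the lambda-return used by the Dreamer agent's _compute_return (refactored in PATCH 23/24): the backward loop accumulates R_t = target_t + discount_t * lambda * R_{t+1}, where target_t = r_t + discount_t * (1 - lambda) * V(s_{t+1}), bootstrapped with the critic value at the end of the imagination horizon. A minimal self-contained sketch of that computation, with the function name and shapes assumed for illustration (the in-tree version lives as a method on the agent class):

    import torch

    def lambda_return(reward, value, discount, bootstrap, lambda_):
        # Sketch only; reward, value, discount are [T, ...] tensors over the
        # imagination horizon, bootstrap is the critic value for the step
        # after the horizon.
        # Blend the one-step TD target with the recursively accumulated return:
        #   R_t = r_t + discount_t * ((1 - lambda) * V(s_{t+1}) + lambda * R_{t+1})
        next_values = torch.cat([value[1:], bootstrap[None]], dim=0)
        target = reward + discount * next_values * (1 - lambda_)
        returns = []
        accumulated = bootstrap
        for t in range(reward.shape[0] - 1, -1, -1):
            accumulated = target[t] + discount[t] * lambda_ * accumulated
            returns.append(accumulated)
        return torch.stack(list(reversed(returns)), dim=0)

At t = T-1 this reduces to r + discount * bootstrap, matching the bootstrap term passed in by the training loop above.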
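
The freeze/unfreeze calls added in PATCH 19/24 keep the PlaNet world model out of the actor/critic backward pass during agent training. Their implementation is not shown in this series; a minimal sketch of what such helpers typically do, assuming they simply toggle requires_grad on the module parameters (the helpers actually imported in mbrl.planning.dreamer_agent may differ):

    import torch.nn as nn

    def freeze(module: nn.Module) -> None:
        # Assumed behavior: stop gradients from flowing into this module.
        for p in module.parameters():
            p.requires_grad_(False)

    def unfreeze(module: nn.Module) -> None:
        # Assumed behavior: re-enable gradient tracking after the update.
        for p in module.parameters():
            p.requires_grad_(True)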