Description
🐛 Bug
Hello, I use SB3 together with Isaac Lab to train my agent. My SB3 is installed using ./isaaclab.sh --install sb3, and my Isaac Lab version is cloned from the araffin fork, branch feat/sb3-optim.
During my tests I ran into the same problem as described in #1966.
As the profiled code in the log output shows, some memory is not released correctly, which leads to the memory usage exploding.
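For context, the per-line tables in the log output below come from memory_profiler (the profiled copy of collect_rollouts lives in my ppo_netfree.py). A minimal sketch of an equivalent setup, assuming only that the memory_profiler package is installed:

# Sketch only, not my exact setup: patch SB3's rollout collection with
# memory_profiler so every call prints a per-line memory report like the
# tables in the log output below.
from memory_profiler import profile

from stable_baselines3.common.on_policy_algorithm import OnPolicyAlgorithm

# Wrap the method in place; each rollout then prints a
# "Line # / Mem usage / Increment / Occurrences" table for collect_rollouts.
OnPolicyAlgorithm.collect_rollouts = profile(OnPolicyAlgorithm.collect_rollouts)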
To Reproduce
# Copyright (c) 2022-2025, The Isaac Lab Project Developers.
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause

"""
Script to train RL agent with Stable Baselines3.

Example: ./isaaclab.sh -p scripts/reinforcement_learning/sb3/train.py
    --task Isaac-Velocity-Flat-Unitree-A1-v0 --num_envs 2048 --headless --seed 2
"""

"""Launch Isaac Sim Simulator first."""

import argparse
import contextlib
import signal
import sys
from pathlib import Path

from isaaclab.app import AppLauncher

# add argparse arguments
parser = argparse.ArgumentParser(description="Train an RL agent with Stable-Baselines3.")
parser.add_argument("--video", action="store_true", default=False, help="Record videos during training.")
parser.add_argument("--video_length", type=int, default=200, help="Length of the recorded video (in steps).")
parser.add_argument("--video_interval", type=int, default=2000, help="Interval between video recordings (in steps).")
parser.add_argument("--num_envs", type=int, default=None, help="Number of environments to simulate.")
parser.add_argument("--task", type=str, default=None, help="Name of the task.")
parser.add_argument("--seed", type=int, default=None, help="Seed used for the environment")
parser.add_argument("--log_interval", type=int, default=100_000, help="Log data every n timesteps.")
parser.add_argument("--max_iterations", type=int, default=None, help="RL Policy training iterations.")
parser.add_argument(
    "--keep_all_info",
    action="store_true",
    default=False,
    help="Use a slower SB3 wrapper but keep all the extra training info.",
)
# append AppLauncher cli args
AppLauncher.add_app_launcher_args(parser)
# parse the arguments
args_cli, hydra_args = parser.parse_known_args()
# always enable cameras to record video
if args_cli.video:
    args_cli.enable_cameras = True

# clear out sys.argv for Hydra
sys.argv = [sys.argv[0]] + hydra_args

# launch omniverse app
app_launcher = AppLauncher(args_cli)
simulation_app = app_launcher.app


def cleanup_pbar(*args):
    """
    A small helper to stop training and
    cleanup progress bar properly on ctrl+c
    """
    import gc

    tqdm_objects = [obj for obj in gc.get_objects() if "tqdm" in type(obj).__name__]
    for tqdm_object in tqdm_objects:
        if "tqdm_rich" in type(tqdm_object).__name__:
            tqdm_object.close()
    raise KeyboardInterrupt


# disable KeyboardInterrupt override
signal.signal(signal.SIGINT, cleanup_pbar)

"""Rest everything follows."""

import gymnasium as gym
import numpy as np
import os
import random
from datetime import datetime

from stable_baselines3 import PPO

# from e3catch.tasks.direct.learning.policy.ppo_netfree import E3CatchPPO
from stable_baselines3.common.callbacks import CheckpointCallback, LogEveryNTimesteps
from stable_baselines3.common.vec_env import VecNormalize

from isaaclab.envs import (
    DirectMARLEnv,
    DirectMARLEnvCfg,
    DirectRLEnvCfg,
    ManagerBasedRLEnvCfg,
    multi_agent_to_single_agent,
)
from isaaclab.utils.dict import print_dict
from isaaclab.utils.io import dump_pickle, dump_yaml
from isaaclab_rl.sb3 import Sb3VecEnvWrapper, process_sb3_cfg

import isaaclab_tasks  # noqa: F401
from isaaclab_tasks.utils.hydra import hydra_task_config

import e3catch.tasks  # noqa: F401
@hydra_task_config(args_cli.task, "sb3_cfg_entry_point")
def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agent_cfg: dict):
    """Train with stable-baselines agent."""
    # randomly sample a seed if seed = -1
    if args_cli.seed == -1:
        args_cli.seed = random.randint(0, 10000)
    # override configurations with non-hydra CLI arguments
    env_cfg.scene.num_envs = args_cli.num_envs if args_cli.num_envs is not None else env_cfg.scene.num_envs
    agent_cfg["seed"] = args_cli.seed if args_cli.seed is not None else agent_cfg["seed"]
    # max iterations for training
    if args_cli.max_iterations is not None:
        agent_cfg["n_timesteps"] = args_cli.max_iterations * agent_cfg["n_steps"] * env_cfg.scene.num_envs
    # set the environment seed
    # note: certain randomizations occur in the environment initialization so we set the seed here
    env_cfg.seed = agent_cfg["seed"]
    env_cfg.sim.device = args_cli.device if args_cli.device is not None else env_cfg.sim.device
    # directory for logging into
    run_info = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    log_root_path = os.path.abspath(os.path.join("logs", "sb3", args_cli.task))
    print(f"[INFO] Logging experiment in directory: {log_root_path}")
    print(f"Exact experiment name requested from command line: {run_info}")
    log_dir = os.path.join(log_root_path, run_info)
    # dump the configuration into log-directory
    dump_yaml(os.path.join(log_dir, "params", "env.yaml"), env_cfg)
    dump_yaml(os.path.join(log_dir, "params", "agent.yaml"), agent_cfg)
    dump_pickle(os.path.join(log_dir, "params", "env.pkl"), env_cfg)
    dump_pickle(os.path.join(log_dir, "params", "agent.pkl"), agent_cfg)
    # save command used to run the script
    command = " ".join(sys.orig_argv)
    (Path(log_dir) / "command.txt").write_text(command)
    # post-process agent configuration
    agent_cfg = process_sb3_cfg(agent_cfg)
    # read configurations about the agent-training
    policy_arch = agent_cfg.pop("policy")
    n_timesteps = agent_cfg.pop("n_timesteps")
    # create isaac environment
    env = gym.make(args_cli.task, cfg=env_cfg, render_mode="rgb_array" if args_cli.video else None)
    # convert to single-agent instance if required by the RL algorithm
    if isinstance(env.unwrapped, DirectMARLEnv):
        env = multi_agent_to_single_agent(env)
    # wrap for video recording
    if args_cli.video:
        video_kwargs = {
            "video_folder": os.path.join(log_dir, "videos", "train"),
            "step_trigger": lambda step: step % args_cli.video_interval == 0,
            "video_length": args_cli.video_length,
            "disable_logger": True,
        }
        print("[INFO] Recording videos during training.")
        print_dict(video_kwargs, nesting=4)
        env = gym.wrappers.RecordVideo(env, **video_kwargs)
    # wrap around environment for stable baselines
    env = Sb3VecEnvWrapper(env, fast_variant=not args_cli.keep_all_info)
    if "normalize_input" in agent_cfg:
        env = VecNormalize(
            env,
            training=True,
            norm_obs="normalize_input" in agent_cfg and agent_cfg.pop("normalize_input"),
            norm_reward="normalize_value" in agent_cfg and agent_cfg.pop("normalize_value"),
            clip_obs="clip_obs" in agent_cfg and agent_cfg.pop("clip_obs"),
            gamma=agent_cfg["gamma"],
            clip_reward=np.inf,
        )
    # create agent from stable baselines
    agent = PPO(policy_arch, env, verbose=1, tensorboard_log=log_dir, **agent_cfg)
    # callbacks for agent
    checkpoint_callback = CheckpointCallback(save_freq=1000, save_path=log_dir, name_prefix="model", verbose=2)
    callbacks = [checkpoint_callback, LogEveryNTimesteps(n_steps=args_cli.log_interval)]
    env.unwrapped.cfg.run_type = "train"  # set run type to train
    # train the agent
    with contextlib.suppress(KeyboardInterrupt):
        agent.learn(
            total_timesteps=n_timesteps,
            callback=callbacks,
            progress_bar=True,
            log_interval=None,
        )
    # save the final model
    agent.save(os.path.join(log_dir, "model"))
    print("Saving to:")
    print(os.path.join(log_dir, "model.zip"))
    if isinstance(env, VecNormalize):
        print("Saving normalization")
        env.save(os.path.join(log_dir, "model_vecnormalize.pkl"))
    # close the simulator
    env.close()


if __name__ == "__main__":
    # run the main function
    main()
    # close sim app
    simulation_app.close()
The reproduction step is the following, where Template-StateCatch-Direct-v0 is my custom env and task:
python e3catch/scripts/sb3/train.py --task=Template-StateCatch-Direct-v0 --headless
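Independently of memory_profiler, the growth can also be watched by logging the process RSS after every rollout. A hypothetical helper for that, assuming psutil is available (MemoryUsageCallback is not part of the script above, it only makes the leak easy to see in TensorBoard):

import os

import psutil
from stable_baselines3.common.callbacks import BaseCallback


class MemoryUsageCallback(BaseCallback):
    """Record the resident memory of the training process after each rollout."""

    def __init__(self, verbose: int = 0):
        super().__init__(verbose)
        self._process = psutil.Process(os.getpid())

    def _on_step(self) -> bool:
        # Nothing to do per step; returning True keeps training running.
        return True

    def _on_rollout_end(self) -> None:
        # RSS in MiB, logged next to the usual SB3 metrics.
        rss_mib = self._process.memory_info().rss / 2**20
        self.logger.record("memory/rss_mib", rss_mib)

It can simply be appended to the callbacks list in the training script above.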
Relevant log output / Error message
The two tables below are two consecutive calls of the profiled collect_rollouts; note that the resident memory keeps growing from one rollout to the next (roughly 3806 MiB -> 3849 MiB -> 3916 MiB).
Line # Mem usage Increment Occurrences Line Contents
=============================================================
463 3806.3 MiB 3806.3 MiB 1 @profile
464 def collect_rollouts(
465 self,
466 env: VecEnv,
467 callback: BaseCallback,
468 rollout_buffer: RolloutBuffer,
469 n_rollout_steps: int,
470 ) -> bool:
471 """
472 Collect experiences using the current policy and fill a ``RolloutBuffer``.
473 The term rollout here refers to the model-free notion and should not
474 be used with the concept of rollout used in model-based RL or planning.
475
476 :param env: The training environment
477 :param callback: Callback that will be called at each step
478 (and at the beginning and end of the rollout)
479 :param rollout_buffer: Buffer to fill with rollouts
480 :param n_rollout_steps: Number of experiences to collect per environment
481 :return: True if function returned with at least `n_rollout_steps`
482 collected, False if callback terminated rollout prematurely.
483 """
484 3806.3 MiB 0.0 MiB 1 assert self._last_obs is not None, "No previous observation was provided"
485 # Switch to eval mode (this affects batch norm / dropout)
486 3806.3 MiB 0.0 MiB 1 self.policy.set_training_mode(False)
487
488 3806.3 MiB 0.0 MiB 1 n_steps = 0
489 3738.8 MiB -67.5 MiB 1 rollout_buffer.reset()
490 # Sample new weights for the state dependent exploration
491 3738.8 MiB 0.0 MiB 1 if self.use_sde:
492 self.policy.reset_noise(env.num_envs)
493
494 3738.8 MiB 0.0 MiB 1 callback.on_rollout_start()
495
496 3848.8 MiB 0.0 MiB 17 while n_steps < n_rollout_steps:
497 3840.3 MiB 0.0 MiB 16 if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
498 # Sample a new noise matrix
499 self.policy.reset_noise(env.num_envs)
500
501 3840.3 MiB 0.0 MiB 32 with th.no_grad():
502 # Convert to pytorch tensor or to TensorDict
503 3840.3 MiB 0.0 MiB 16 obs_tensor = obs_as_tensor(self._last_obs, self.device)
504 3840.3 MiB 0.0 MiB 16 actions, values, log_probs = self.policy(obs_tensor)
505 3840.3 MiB 0.0 MiB 16 actions = actions.cpu().numpy()
506
507 # Rescale and perform action
508 3840.3 MiB 0.0 MiB 16 clipped_actions = actions
509
510 3840.3 MiB 0.0 MiB 16 if isinstance(self.action_space, spaces.Box):
511 3840.3 MiB 0.0 MiB 16 if self.policy.squash_output:
512 # Unscale the actions to match env bounds
513 # if they were previously squashed (scaled in [-1, 1])
514 clipped_actions = self.policy.unscale_action(clipped_actions)
515 else:
516 # Otherwise, clip the actions to avoid out of bound error
517 # as we are sampling from an unbounded Gaussian distribution
518 3840.3 MiB 0.0 MiB 16 clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)
519
520 3844.7 MiB 42.5 MiB 16 new_obs, rewards, dones, infos = env.step(clipped_actions)
521
522 3844.7 MiB 0.0 MiB 16 self.num_timesteps += env.num_envs
523
524 # Give access to local variables
525 3844.7 MiB 0.0 MiB 16 callback.update_locals(locals())
526 3844.7 MiB 0.2 MiB 16 if not callback.on_step():
527 return False
528
529 3844.7 MiB 0.0 MiB 16 self._update_info_buffer(infos, dones)
530 3844.7 MiB 0.0 MiB 16 n_steps += 1
531
532 3844.7 MiB 0.0 MiB 16 if isinstance(self.action_space, spaces.Discrete):
533 # Reshape in case of discrete action
534 actions = actions.reshape(-1, 1)
535
536 # Handle timeout by bootstrapping with value function
537 # see GitHub issue #633
538 3844.7 MiB 0.0 MiB 65552 for idx, done in enumerate(dones):
539 3844.7 MiB 0.0 MiB 65536 if (
540 3844.7 MiB 0.0 MiB 65536 done
541 3844.7 MiB 0.0 MiB 222 and infos[idx].get("terminal_observation") is not None
542 3844.7 MiB 0.0 MiB 111 and infos[idx].get("TimeLimit.truncated", False)
543 ):
544 3844.7 MiB 0.0 MiB 111 terminal_obs = self.policy.obs_to_tensor(infos[idx]["terminal_observation"])[0]
545 3844.7 MiB 0.0 MiB 222 with th.no_grad():
546 3844.7 MiB 0.0 MiB 111 terminal_value = self.policy.predict_values(terminal_obs)[0] # type: ignore[arg-type]
547 3844.7 MiB 0.0 MiB 111 rewards[idx] += self.gamma * terminal_value
548
549 3848.8 MiB 67.3 MiB 32 rollout_buffer.add(
550 3844.7 MiB 0.0 MiB 16 self._last_obs, # type: ignore[arg-type]
551 3844.7 MiB 0.0 MiB 16 actions,
552 3844.7 MiB 0.0 MiB 16 rewards,
553 3844.7 MiB 0.0 MiB 16 self._last_episode_starts, # type: ignore[arg-type]
554 3844.7 MiB 0.0 MiB 16 values,
555 3844.7 MiB 0.0 MiB 16 log_probs,
556 )
557 3848.8 MiB 0.0 MiB 16 self._last_obs = new_obs # type: ignore[assignment]
558 3848.8 MiB 0.0 MiB 16 self._last_episode_starts = dones
559
560 3848.8 MiB 0.0 MiB 2 with th.no_grad():
561 # Compute value for the last timestep
562 3848.8 MiB 0.0 MiB 1 values = self.policy.predict_values(obs_as_tensor(new_obs, self.device)) # type: ignore[arg-type]
563
564 3848.8 MiB 0.0 MiB 1 rollout_buffer.compute_returns_and_advantage(last_values=values, dones=dones)
565
566 3848.8 MiB 0.0 MiB 1 callback.update_locals(locals())
567
568 3848.8 MiB 0.0 MiB 1 callback.on_rollout_end()
569
570 3848.8 MiB 0.0 MiB 1 return True
Filename: /home/hjy/HJY/Codes/E3CATCH_ws/src/e3catch/source/e3catch/e3catch/tasks/direct/learning/policy/ppo_netfree.py
Line # Mem usage Increment Occurrences Line Contents
=============================================================
463 3848.9 MiB 3848.9 MiB 1 @profile
464 def collect_rollouts(
465 self,
466 env: VecEnv,
467 callback: BaseCallback,
468 rollout_buffer: RolloutBuffer,
469 n_rollout_steps: int,
470 ) -> bool:
471 """
472 Collect experiences using the current policy and fill a ``RolloutBuffer``.
473 The term rollout here refers to the model-free notion and should not
474 be used with the concept of rollout used in model-based RL or planning.
475
476 :param env: The training environment
477 :param callback: Callback that will be called at each step
478 (and at the beginning and end of the rollout)
479 :param rollout_buffer: Buffer to fill with rollouts
480 :param n_rollout_steps: Number of experiences to collect per environment
481 :return: True if function returned with at least `n_rollout_steps`
482 collected, False if callback terminated rollout prematurely.
483 """
484 3848.9 MiB 0.0 MiB 1 assert self._last_obs is not None, "No previous observation was provided"
485 # Switch to eval mode (this affects batch norm / dropout)
486 3848.9 MiB 0.0 MiB 1 self.policy.set_training_mode(False)
487
488 3848.9 MiB 0.0 MiB 1 n_steps = 0
489 3781.4 MiB -67.5 MiB 1 rollout_buffer.reset()
490 # Sample new weights for the state dependent exploration
491 3781.4 MiB 0.0 MiB 1 if self.use_sde:
492 self.policy.reset_noise(env.num_envs)
493
494 3781.4 MiB 0.0 MiB 1 callback.on_rollout_start()
495
496 3916.5 MiB 0.0 MiB 17 while n_steps < n_rollout_steps:
497 3908.0 MiB 0.0 MiB 16 if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
498 # Sample a new noise matrix
499 self.policy.reset_noise(env.num_envs)
500
501 3908.0 MiB 0.0 MiB 32 with th.no_grad():
502 # Convert to pytorch tensor or to TensorDict
503 3908.0 MiB 0.0 MiB 16 obs_tensor = obs_as_tensor(self._last_obs, self.device)
504 3908.0 MiB 0.0 MiB 16 actions, values, log_probs = self.policy(obs_tensor)
505 3908.0 MiB 0.0 MiB 16 actions = actions.cpu().numpy()
506
507 # Rescale and perform action
508 3908.0 MiB 0.0 MiB 16 clipped_actions = actions
509
510 3908.0 MiB 0.0 MiB 16 if isinstance(self.action_space, spaces.Box):
511 3908.0 MiB 0.0 MiB 16 if self.policy.squash_output:
512 # Unscale the actions to match env bounds
513 # if they were previously squashed (scaled in [-1, 1])
514 clipped_actions = self.policy.unscale_action(clipped_actions)
515 else:
516 # Otherwise, clip the actions to avoid out of bound error
517 # as we are sampling from an unbounded Gaussian distribution
518 3908.0 MiB 0.0 MiB 16 clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)
519
520 3912.4 MiB 68.0 MiB 16 new_obs, rewards, dones, infos = env.step(clipped_actions)
521
522 3912.4 MiB 0.0 MiB 16 self.num_timesteps += env.num_envs
523
524 # Give access to local variables
525 3912.4 MiB 0.0 MiB 16 callback.update_locals(locals())
526 3912.4 MiB 0.0 MiB 16 if not callback.on_step():
527 return False
528
529 3912.4 MiB 0.0 MiB 16 self._update_info_buffer(infos, dones)
530 3912.4 MiB 0.0 MiB 16 n_steps += 1
531
532 3912.4 MiB 0.0 MiB 16 if isinstance(self.action_space, spaces.Discrete):
533 # Reshape in case of discrete action
534 actions = actions.reshape(-1, 1)
535
536 # Handle timeout by bootstrapping with value function
537 # see GitHub issue #633
538 3912.4 MiB 0.0 MiB 65552 for idx, done in enumerate(dones):
539 3912.4 MiB 0.0 MiB 65536 if (
540 3912.4 MiB 0.0 MiB 65536 done
541 3912.4 MiB 0.0 MiB 218 and infos[idx].get("terminal_observation") is not None
542 3912.4 MiB 0.0 MiB 109 and infos[idx].get("TimeLimit.truncated", False)
543 ):
544 3912.4 MiB 0.0 MiB 109 terminal_obs = self.policy.obs_to_tensor(infos[idx]["terminal_observation"])[0]
545 3912.4 MiB 0.0 MiB 218 with th.no_grad():
546 3912.4 MiB 0.0 MiB 109 terminal_value = self.policy.predict_values(terminal_obs)[0] # type: ignore[arg-type]
547 3912.4 MiB 0.0 MiB 109 rewards[idx] += self.gamma * terminal_value
548
549 3916.5 MiB 67.0 MiB 32 rollout_buffer.add(
550 3912.4 MiB 0.0 MiB 16 self._last_obs, # type: ignore[arg-type]
551 3912.4 MiB 0.0 MiB 16 actions,
552 3912.4 MiB 0.0 MiB 16 rewards,
553 3912.4 MiB 0.0 MiB 16 self._last_episode_starts, # type: ignore[arg-type]
554 3912.4 MiB 0.0 MiB 16 values,
555 3912.4 MiB 0.0 MiB 16 log_probs,
556 )
557 3916.5 MiB 0.0 MiB 16 self._last_obs = new_obs # type: ignore[assignment]
558 3916.5 MiB 0.0 MiB 16 self._last_episode_starts = dones
559
560 3916.5 MiB 0.0 MiB 2 with th.no_grad():
561 # Compute value for the last timestep
562 3916.5 MiB 0.0 MiB 1 values = self.policy.predict_values(obs_as_tensor(new_obs, self.device)) # type: ignore[arg-type]
563
564 3916.5 MiB 0.0 MiB 1 rollout_buffer.compute_returns_and_advantage(last_values=values, dones=dones)
565
566 3916.5 MiB 0.0 MiB 1 callback.update_locals(locals())
567
568 3916.5 MiB 0.0 MiB 1 callback.on_rollout_end()
569
570 3916.5 MiB 0.0 MiB 1 return True
System Info
|---------------------------------------------------------------------------------------------|
| Driver Version: 545.23.08 | Graphics API: Vulkan
|=============================================================================================|
| GPU | Name | Active | LDA | GPU Memory | Vendor-ID | LUID |
| | | | | | Device-ID | UUID |
| | | | | | Bus-ID | |
|---------------------------------------------------------------------------------------------|
| 0 | NVIDIA GeForce RTX 3090 | Yes: 0 | | 24822 MB | 10de | 0 |
| | | | | | 2204 | a96dd751.. |
| | | | | | 2 | |
|---------------------------------------------------------------------------------------------|
| 1 | NVIDIA GeForce RTX 3090 | Yes: 1 | | 24822 MB | 10de | 0 |
| | | | | | 2204 | b36bae89.. |
| | | | | | 3 | |
|---------------------------------------------------------------------------------------------|
| 2 | NVIDIA GeForce RTX 3090 | Yes: 2 | | 24822 MB | 10de | 0 |
| | | | | | 2204 | 8f4d4115.. |
| | | | | | 82 | |
|---------------------------------------------------------------------------------------------|
| 3 | NVIDIA GeForce RTX 3090 | Yes: 3 | | 24822 MB | 10de | 0 |
| | | | | | 2204 | 106d5cb9.. |
| | | | | | 83 | |
|=============================================================================================|
| OS: 20.04.6 LTS (Focal Fossa) ubuntu, Version: 20.04.6, Kernel: 5.4.0-215-generic
| XServer Vendor: The X.Org Foundation, XServer Version: 12013000 (1.20.13.0)
| Processor: Intel(R) Xeon(R) CPU E5-2697 v4 @ 2.30GHz
| Cores: 36 | Logical Cores: 72
|---------------------------------------------------------------------------------------------|
and the SB3 system info output:
- OS: Linux-5.4.0-215-generic-x86_64-with-glibc2.31 # 235-Ubuntu SMP Fri Apr 11 21:55:32 UTC 2025
- Python: 3.10.16
- Stable-Baselines3: 2.6.0
- PyTorch: 2.5.1+cu124
- GPU Enabled: True
- Numpy: 1.26.0
- Cloudpickle: 3.1.1
- Gymnasium: 0.29.1
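The block above looks like the output of SB3's built-in system info helper; for completeness, a sketch of how it can be regenerated (assuming get_system_info is exposed in stable_baselines3.common.utils, as in SB3 2.6.0):

from stable_baselines3.common.utils import get_system_info

# Prints the "- OS / - Python / - Stable-Baselines3 / ..." block shown above
# and also returns it as a dict plus a formatted string.
env_info, env_info_str = get_system_info(print_info=True)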
Checklist
- My issue does not relate to a custom gym environment. (Use the custom gym env template instead)
- I have checked that there is no similar issue in the repo
- I have read the documentation
- I have provided a minimal and working example to reproduce the bug
- I've used the markdown code blocks for both code and stack traces.