-
Notifications
You must be signed in to change notification settings - Fork 16
Open
Description
Hi there,
thanks for the nice paper and code.
I am trying to run the code on a single node with a single GPU.
I am currently encountering this error:
(TaskRunner pid=331451) [<ray.util.placement_group.PlacementGroup object at 0x7f7944090290>]
(TaskRunner pid=331451) WARNING:2025-06-05 16:31:57,528:Waiting for register center actor 3aSkv3_register_center to be ready. Elapsed time: 0 seconds out of 300 seconds.
(WorkerDict pid=336431) [W605 16:32:03.287362855 socket.cpp:759] [c10d] The client socket cannot be initialized to connect to [kube-proxy-10.194.0.22.magiccube-prom-stack-kube-proxy.default.svc.cluster.local]:34319 (errno: 97 - Address family not supported by protocol).
Here are my training parameters. Do you have any idea where the problem might lie? Thanks.
{
"version": "0.2.0",
"configurations": [
{
"name": "torchrun verl trainer (main_ppo)",
"type": "python",
"request": "launch",
//"program": "/opt/conda/envs/v_triune/bin/torchrun",
"program":"${workspaceFolder}/verl/trainer/main_ppo.py",
//"program":"-m",
"args": [
//"--nproc_per_node=1",
//"--master_port=29500",
//"verl/trainer/main_ppo.py",
//"verl.trainer.main_ppo",
"data.train_files=[/Orsta-Data-47k/train/train_detection_v3det_4000.parquet]",
"data.test_files=[]",
"data.train_batch_size=1",
"data.test_batch_size=1",
"data.max_prompt_length=8192",
"data.max_response_length=1024",
"data.filter_overlong_prompts=False",
"data.truncation=error",
"data.image_key=images",
"data.shuffle=True",
"data.num_examine_train=0",
"data.num_examine_test=0",
"actor_rollout_ref.model.path=/dahuafs/groupdata/Cameraalgorithm/vlm/Qwen/Qwen2.5-VL-3B-Instruct",
"actor_rollout_ref.model.enable_gradient_checkpointing=True",
"actor_rollout_ref.model.use_remove_padding=True",
"actor_rollout_ref.model.use_liger=False",
"actor_rollout_ref.actor.optim.lr=1e-6",
"actor_rollout_ref.actor.optim.lr_vit=1e-6",
"actor_rollout_ref.actor.optim.lr_connector=1e-6",
"actor_rollout_ref.actor.optim.lr_llm=1e-6",
"actor_rollout_ref.actor.optim.lr_freeze=[vit,connector]",
"actor_rollout_ref.actor.ppo_mini_batch_size=1",
"actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1",
"actor_rollout_ref.actor.ppo_max_token_len_per_gpu=1024",
"actor_rollout_ref.actor.clip_ratio=0.2",
"actor_rollout_ref.actor.clip_ratio_low=0.2",
"actor_rollout_ref.actor.clip_ratio_high=0.28",
"actor_rollout_ref.actor.loss_agg_mode=token-mean",
"actor_rollout_ref.actor.use_kl_loss=True",
"actor_rollout_ref.actor.use_torch_compile=True",
"actor_rollout_ref.actor.kl_loss_coef=0.001",
"actor_rollout_ref.actor.kl_loss_type=mse",
"actor_rollout_ref.actor.entropy_coeff=0.000",
"actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.05",
"actor_rollout_ref.actor.optim.warmup_style=constant",
"actor_rollout_ref.actor.fsdp_config.param_offload=False",
"actor_rollout_ref.actor.fsdp_config.optimizer_offload=False",
"actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1",
"actor_rollout_ref.rollout.tensor_model_parallel_size=1",
"actor_rollout_ref.rollout.name=vllm",
"actor_rollout_ref.rollout.gpu_memory_utilization=0.7",
"actor_rollout_ref.rollout.temperature=1.0",
"actor_rollout_ref.rollout.enable_chunked_prefill=False",
"actor_rollout_ref.rollout.enforce_eager=False",
"actor_rollout_ref.rollout.max_num_batched_tokens=10240",
"actor_rollout_ref.rollout.free_cache_engine=False",
"actor_rollout_ref.rollout.n=1",
"actor_rollout_ref.rollout.engine_kwargs.swap_space=16",
"actor_rollout_ref.rollout.val_kwargs.temperature=0",
"actor_rollout_ref.rollout.val_kwargs.top_p=1",
"actor_rollout_ref.rollout.val_kwargs.do_sample=False",
"actor_rollout_ref.rollout.limit_images=1",
"actor_rollout_ref.rollout.limit_videos=0",
"actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1",
"actor_rollout_ref.ref.fsdp_config.param_offload=False",
"algorithm.adv_estimator=grpo",
"algorithm.use_kl_in_reward=False",
"algorithm.kl_ctrl.kl_coef=0.000",
"trainer.critic_warmup=0",
//"trainer.logger=['console','wandb']",
"trainer.project_name=v_triune",
"trainer.experiment_name=v_triune",
"trainer.n_gpus_per_node=1",
"trainer.nnodes=1",
"trainer.default_local_dir=/verl_exp/v_triune",
"trainer.save_freq=100",
"trainer.test_freq=-5",
"trainer.total_epochs=3",
"trainer.resume_mode=auto",
"trainer.val_before_train=False",
"reward_model.reward_manager=remote",
"+reward_model.remote_reward_job_id=j-7wfuoekanm"
//"+reward_model.remote_reward_job_id=8192"
],
"console": "integratedTerminal",
"justMyCode": false,
"env": {
"TORCH_DISTRIBUTED_USE_V4": "1",
"CUDA_VISIBLE_DEVICES": "0",
"NUM_NODES": "1",
"GPUS_PER_NODE": "1",
"EXP_NAME": "v_triune",
"REMOTE_REWARD_JOB_ID":"j-7wfuoekanm",
//"REMOTE_REWARD_JOB_ID": "j-10.194.6.29:8192",
//"REMOTE_REWARD_JOB_ID": "8192",
"DATA_TRAIN_FILE": "[/dahuafs/groupdata/Cameraalgorithm/vlm/Orsta-Data-47k/train/train_detection_v3det_4000.parquet]",
"ACTOR_CLIP_RATIO": "0.2",
"ACTOR_CLIP_RATIO_HIGH": "0.28",
"ACTOR_CLIP_RATIO_LOW": "0.2",
"ACTOR_ENTROPY_COEFF": "0.000",
"ACTOR_KL_LOSS_COEFF": "0.001",
"ACTOR_USE_KL_LOSS": "True",
"ACTOR_KL_LOSS_TYPE": "mse",
"ACTOR_LOSS_AGG_MODE": "token-mean",
"ENABLE_DUAL_CLIP": "True",
"ACTOR_USE_LIGER": "False",
"ACTOR_LOAD_PATH": "/dahuafs/groupdata/Cameraalgorithm/vlm/Qwen/Qwen2.5-VL-3B-Instruct",
"TRAIN_SAVE_FREQ": "100",
"TRAIN_SAVE_PATH": "/verl_exp",
"ACTOR_FSDP_OMT_OFFLOAD": "False",
"ACTOR_FSDP_PARAM_OFFLOAD": "False",
"ACTOR_PPO_GLOBAL_BSZ": "1",
"ACTOR_PPO_MICRO_BSZ": "1",
"ACTOR_PPO_MAX_TOKEN_LEN_PER_GPU": "1024",
"ALGO_ADV_ESTIMATOR": "grpo",
"ALGO_KL_COEF": "0.000",
"LOG_P_MICRO_BSZ": "1",
"DATA_TRAIN_BATCH_SIZE": "1",
"DATA_TEST_BATCH_SIZE": "1",
"DATA_MAX_RES_LENGTH": "1024",
"DATA_FILTER_OVERLONG_PROMPTS": "False",
"DATA_IMAGE_KEYWORD": "images",
"DATA_MAX_PROMPT_LENGTH": "8192",
"DATA_SHUFFLE": "True",
"DATA_NUM_EXAMINE_TRAIN": "0",
"DATA_NUM_EXAMINE_TEST": "0",
"ACTOR_LR": "1e-6",
"ACTOR_LR_FREEZE": "[vit,connector]",
"ACTOR_LR_VIT": "1e-6",
"ACTOR_LR_CONNECTOR": "1e-6",
"ACTOR_LR_LLM": "1e-6",
"WARMUP_STYLE": "constant",
"LR_WARMUP_STEPS_RATIO": "0.05",
"ROLLOUT_CHUNKED_PREFILL": "False",
"ROLLOUT_FREE_CACHE_ENFORCE_EAGER": "False",
"ROLLOUT_MAX_GPU_MEM": "0.2",
"ROLLOUT_MAX_NUM_BATCHED_TOKENS": "10240",
"ROLLOUT_N": "1",
"ROLLOUT_SWAP_SPACE": "16",
"ROLLOUT_TEMP": "1.0",
"ROLLOUT_TP_SIZE": "1",
"ROLLOUT_IMAGE_LIMIT": "1",
"ROLLOUT_VIDEO_LIMIT": "0",
"EVAL_BEFORE_TRAIN": "False",
"EVAL_DO_SAMPLE": "False",
"EVAL_TEMP": "0",
"EVAL_TOPP": "1",
"TRAIN_PROJECT_NAME": "v_triune",
"TRAIN_TEST_FREQ": "-5",
"TRAIN_TOTAL_EPOCHS": "3",
//"WANDB_API_KEY": "your wandb api key"
}
}
]
}
Metadata
Metadata
Assignees
Labels
No labels