Skip to content

Unsuccessful client socket initialization #4

@nunuisacat

Description

@nunuisacat

Hi there,
thanks for the nice paper and code.
I am trying to run the code on a single node with a single GPU,
and I am currently encountering this error:

(TaskRunner pid=331451) [<ray.util.placement_group.PlacementGroup object at 0x7f7944090290>]
(TaskRunner pid=331451) WARNING:2025-06-05 16:31:57,528:Waiting for register center actor 3aSkv3_register_center to be ready. Elapsed time: 0 seconds out of 300 seconds.
(WorkerDict pid=336431) [W605 16:32:03.287362855 socket.cpp:759] [c10d] The client socket cannot be initialized to connect to [kube-proxy-10.194.0.22.magiccube-prom-stack-kube-proxy.default.svc.cluster.local]:34319 (errno: 97 - Address family not supported by protocol).

Here are my training parameters. Do you have any idea where the problem might lie? Thanks.

{
    "version": "0.2.0",
    "configurations": [
      {
        "name": "torchrun verl trainer (main_ppo)",
        "type": "python",
        "request": "launch",
        //"program": "/opt/conda/envs/v_triune/bin/torchrun",
        "program":"${workspaceFolder}/verl/trainer/main_ppo.py",
        //"program":"-m",
        "args": [
            //"--nproc_per_node=1",
            //"--master_port=29500",
            //"verl/trainer/main_ppo.py",
            //"verl.trainer.main_ppo",
            "data.train_files=[/Orsta-Data-47k/train/train_detection_v3det_4000.parquet]",
            "data.test_files=[]",
            "data.train_batch_size=1",
            "data.test_batch_size=1",
            "data.max_prompt_length=8192",
            "data.max_response_length=1024",
            "data.filter_overlong_prompts=False",
            "data.truncation=error",
            "data.image_key=images",
            "data.shuffle=True",
            "data.num_examine_train=0",
            "data.num_examine_test=0",
            "actor_rollout_ref.model.path=/dahuafs/groupdata/Cameraalgorithm/vlm/Qwen/Qwen2.5-VL-3B-Instruct",
            "actor_rollout_ref.model.enable_gradient_checkpointing=True",
            "actor_rollout_ref.model.use_remove_padding=True",
            "actor_rollout_ref.model.use_liger=False",
            "actor_rollout_ref.actor.optim.lr=1e-6",
            "actor_rollout_ref.actor.optim.lr_vit=1e-6",
            "actor_rollout_ref.actor.optim.lr_connector=1e-6",
            "actor_rollout_ref.actor.optim.lr_llm=1e-6",
            "actor_rollout_ref.actor.optim.lr_freeze=[vit,connector]",
            "actor_rollout_ref.actor.ppo_mini_batch_size=1",
            "actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1",
            "actor_rollout_ref.actor.ppo_max_token_len_per_gpu=1024",
            "actor_rollout_ref.actor.clip_ratio=0.2",
            "actor_rollout_ref.actor.clip_ratio_low=0.2",
            "actor_rollout_ref.actor.clip_ratio_high=0.28",
            "actor_rollout_ref.actor.loss_agg_mode=token-mean",
            "actor_rollout_ref.actor.use_kl_loss=True",
            "actor_rollout_ref.actor.use_torch_compile=True",
            "actor_rollout_ref.actor.kl_loss_coef=0.001",
            "actor_rollout_ref.actor.kl_loss_type=mse",
            "actor_rollout_ref.actor.entropy_coeff=0.000",
            "actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.05",
            "actor_rollout_ref.actor.optim.warmup_style=constant",
            "actor_rollout_ref.actor.fsdp_config.param_offload=False",
            "actor_rollout_ref.actor.fsdp_config.optimizer_offload=False",
            "actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1",
            "actor_rollout_ref.rollout.tensor_model_parallel_size=1",
            "actor_rollout_ref.rollout.name=vllm",
            "actor_rollout_ref.rollout.gpu_memory_utilization=0.7",
            "actor_rollout_ref.rollout.temperature=1.0",
            "actor_rollout_ref.rollout.enable_chunked_prefill=False",
            "actor_rollout_ref.rollout.enforce_eager=False",
            "actor_rollout_ref.rollout.max_num_batched_tokens=10240",
            "actor_rollout_ref.rollout.free_cache_engine=False",
            "actor_rollout_ref.rollout.n=1",
            "actor_rollout_ref.rollout.engine_kwargs.swap_space=16",
            "actor_rollout_ref.rollout.val_kwargs.temperature=0",
            "actor_rollout_ref.rollout.val_kwargs.top_p=1",
            "actor_rollout_ref.rollout.val_kwargs.do_sample=False",
            "actor_rollout_ref.rollout.limit_images=1",
            "actor_rollout_ref.rollout.limit_videos=0",
            "actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1",
            "actor_rollout_ref.ref.fsdp_config.param_offload=False",
            "algorithm.adv_estimator=grpo",
            "algorithm.use_kl_in_reward=False",
            "algorithm.kl_ctrl.kl_coef=0.000",
            "trainer.critic_warmup=0",
            //"trainer.logger=['console','wandb']",
            "trainer.project_name=v_triune",
            "trainer.experiment_name=v_triune",
            "trainer.n_gpus_per_node=1",
            "trainer.nnodes=1",
            "trainer.default_local_dir=/verl_exp/v_triune",
            "trainer.save_freq=100",
            "trainer.test_freq=-5",
            "trainer.total_epochs=3",
            "trainer.resume_mode=auto",
            "trainer.val_before_train=False",
            "reward_model.reward_manager=remote",
            "+reward_model.remote_reward_job_id=j-7wfuoekanm"
            //"+reward_model.remote_reward_job_id=8192"

        ],
        "console": "integratedTerminal",
        "justMyCode": false,
        "env": {
            "TORCH_DISTRIBUTED_USE_V4": "1",
            "CUDA_VISIBLE_DEVICES": "0",
            "NUM_NODES": "1",
            "GPUS_PER_NODE": "1",
            "EXP_NAME": "v_triune",
            "REMOTE_REWARD_JOB_ID":"j-7wfuoekanm",
            //"REMOTE_REWARD_JOB_ID": "j-10.194.6.29:8192",
            //"REMOTE_REWARD_JOB_ID": "8192",
            "DATA_TRAIN_FILE": "[/dahuafs/groupdata/Cameraalgorithm/vlm/Orsta-Data-47k/train/train_detection_v3det_4000.parquet]",
            "ACTOR_CLIP_RATIO": "0.2",
            "ACTOR_CLIP_RATIO_HIGH": "0.28",
            "ACTOR_CLIP_RATIO_LOW": "0.2",
            "ACTOR_ENTROPY_COEFF": "0.000",
            "ACTOR_KL_LOSS_COEFF": "0.001",
            "ACTOR_USE_KL_LOSS": "True",
            "ACTOR_KL_LOSS_TYPE": "mse",
            "ACTOR_LOSS_AGG_MODE": "token-mean",
            "ENABLE_DUAL_CLIP": "True",
            "ACTOR_USE_LIGER": "False",
            "ACTOR_LOAD_PATH": "/dahuafs/groupdata/Cameraalgorithm/vlm/Qwen/Qwen2.5-VL-3B-Instruct",
            "TRAIN_SAVE_FREQ": "100",
            "TRAIN_SAVE_PATH": "/verl_exp",
            "ACTOR_FSDP_OMT_OFFLOAD": "False",
            "ACTOR_FSDP_PARAM_OFFLOAD": "False",
            "ACTOR_PPO_GLOBAL_BSZ": "1",
            "ACTOR_PPO_MICRO_BSZ": "1",
            "ACTOR_PPO_MAX_TOKEN_LEN_PER_GPU": "1024",
            "ALGO_ADV_ESTIMATOR": "grpo",
            "ALGO_KL_COEF": "0.000",
            "LOG_P_MICRO_BSZ": "1",
            "DATA_TRAIN_BATCH_SIZE": "1",
            "DATA_TEST_BATCH_SIZE": "1",
            "DATA_MAX_RES_LENGTH": "1024",
            "DATA_FILTER_OVERLONG_PROMPTS": "False",
            "DATA_IMAGE_KEYWORD": "images",
            "DATA_MAX_PROMPT_LENGTH": "8192",
            "DATA_SHUFFLE": "True",
            "DATA_NUM_EXAMINE_TRAIN": "0",
            "DATA_NUM_EXAMINE_TEST": "0",
            "ACTOR_LR": "1e-6",
            "ACTOR_LR_FREEZE": "[vit,connector]",
            "ACTOR_LR_VIT": "1e-6",
            "ACTOR_LR_CONNECTOR": "1e-6",
            "ACTOR_LR_LLM": "1e-6",
            "WARMUP_STYLE": "constant",
            "LR_WARMUP_STEPS_RATIO": "0.05",
            "ROLLOUT_CHUNKED_PREFILL": "False",
            "ROLLOUT_FREE_CACHE_ENFORCE_EAGER": "False",
            "ROLLOUT_MAX_GPU_MEM": "0.2",
            "ROLLOUT_MAX_NUM_BATCHED_TOKENS": "10240",
            "ROLLOUT_N": "1",
            "ROLLOUT_SWAP_SPACE": "16",
            "ROLLOUT_TEMP": "1.0",
            "ROLLOUT_TP_SIZE": "1",
            "ROLLOUT_IMAGE_LIMIT": "1",
            "ROLLOUT_VIDEO_LIMIT": "0",
            "EVAL_BEFORE_TRAIN": "False",
            "EVAL_DO_SAMPLE": "False",
            "EVAL_TEMP": "0",
            "EVAL_TOPP": "1",
            "TRAIN_PROJECT_NAME": "v_triune",
            "TRAIN_TEST_FREQ": "-5",
            "TRAIN_TOTAL_EPOCHS": "3",
            //"WANDB_API_KEY": "your wandb api key"
        }
      }
    ]
  }

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions