Commit 7388b60

feat: add swanlabcallback
1 parent 6de50e6 commit 7388b60

File tree: 3 files changed, +125 / -1 lines changed

paddlenlp/trainer/integrations.py

Lines changed: 89 additions & 0 deletions
@@ -42,6 +42,8 @@ def is_wandb_available():
         return False
     return importlib.util.find_spec("wandb") is not None

+def is_swanlab_available():
+    return importlib.util.find_spec("swanlab") is not None

 def is_ray_available():
     return importlib.util.find_spec("ray.air") is not None
@@ -55,6 +57,8 @@ def get_available_reporting_integrations():
         integrations.append("wandb")
     if is_tensorboardX_available():
         integrations.append("tensorboard")
+    if is_swanlab_available():
+        integrations.append("swanlab")

     return integrations

@@ -395,6 +399,90 @@ def on_save(self, args, state, control, **kwargs):
         self._wandb.log_artifact(artifact, aliases=[f"checkpoint-{state.global_step}"])


+class SwanLabCallback(TrainerCallback):
+    """
+    A [`TrainerCallback`] that logs metrics and media to [Swanlab](https://swanlab.com/).
+    """
+
+    def __init__(self):
+        has_swanlab = is_swanlab_available()
+        if not has_swanlab:
+            raise RuntimeError("SwanlabCallback requires swanlab to be installed. Run `pip install swanlab`.")
+        if has_swanlab:
+            import swanlab
+
+            self._swanlab = swanlab
+
+        self._initialized = False
+
+    def setup(self, args, state, model, **kwargs):
+        """
+        Setup the optional Swanlab integration.
+
+        One can subclass and override this method to customize the setup if needed.
+
+        Environment variables:
+        - **SWANLAB_MODE** (`str`, *optional*, defaults to `"cloud"`):
+            Whether to use swanlab cloud, local or disabled. Set `SWANLAB_MODE="local"` to use local.
+            Set `SWANLAB_MODE="disabled"` to disable.
+        - **SWANLAB_PROJECT** (`str`, *optional*, defaults to `"PaddleNLP"`):
+            Set this to a custom string to store results in a different project.
+        """
+
+        if self._swanlab is None:
+            return
+
+        if args.swanlab_api_key:
+            self._swanlab.login(api_key=args.swanlab_api_key)
+
+        self._initialized = True
+
+        if state.is_world_process_zero:
+            logger.info(
+                'Automatic Swanlab logging enabled, to disable set os.environ["SWANLAB_MODE"] = "disabled"'
+            )
+
+            combined_dict = {**args.to_dict()}
+
+            if hasattr(model, "config") and model.config is not None:
+                model_config = model.config.to_dict()
+                combined_dict = {**model_config, **combined_dict}
+
+            trial_name = state.trial_name
+            init_args = {}
+            if trial_name is not None:
+                init_args["name"] = trial_name
+                init_args["group"] = args.run_name
+            else:
+                if not (args.run_name is None or args.run_name == args.output_dir):
+                    init_args["name"] = args.run_name
+                    init_args["dir"] = args.logging_dir
+            if self._swanlab.run is None:
+                self._swanlab.init(
+                    project=os.getenv("SWANLAB_PROJECT", "PaddleNLP"),
+                    **init_args,
+                )
+            self._swanlab.config.update(combined_dict, allow_val_change=True)
+
+    def on_train_begin(self, args, state, control, model=None, **kwargs):
+        if self._swanlab is None:
+            return
+        if not self._initialized:
+            self.setup(args, state, model, **kwargs)
+
+    def on_train_end(self, args, state, control, model=None, tokenizer=None, **kwargs):
+        if self._swanlab is None:
+            return
+
+    def on_log(self, args, state, control, model=None, logs=None, **kwargs):
+        if self._swanlab is None:
+            return
+        if not self._initialized:
+            self.setup(args, state, model)
+        if state.is_world_process_zero:
+            logs = rewrite_logs(logs)
+            self._swanlab.log({**logs, "train/global_step": state.global_step}, step=state.global_step)
+
+
 class AutoNLPCallback(TrainerCallback):
     """
     A [`TrainerCallback`] that sends the logs to [`Ray Tune`] for [`AutoNLP`]
@@ -423,6 +511,7 @@ def on_evaluate(self, args, state, control, **kwargs):
     "autonlp": AutoNLPCallback,
     "wandb": WandbCallback,
     "tensorboard": TensorBoardCallback,
+    "swanlab": SwanLabCallback,
 }


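The snippet below is a small usage sketch, not part of the commit, showing how the new wiring can be checked once `swanlab` is installed: `is_swanlab_available()` becomes `True`, `get_available_reporting_integrations()` then lists `"swanlab"`, and a training script selects the callback by listing `"swanlab"` in `report_to`. All names used here come from the modified integrations.py.

# Hedged sketch, not part of the commit: a quick check of the new integration hooks.
# Assumes `pip install swanlab` has been run; every imported name is defined in
# paddlenlp/trainer/integrations.py as changed above.
from paddlenlp.trainer.integrations import (
    SwanLabCallback,
    get_available_reporting_integrations,
    is_swanlab_available,
)

print(is_swanlab_available())                  # True once swanlab is importable
print(get_available_reporting_integrations())  # includes "swanlab" when detected

# In a training script the same callback is attached declaratively, e.g.
# TrainingArguments(output_dir="./out", report_to=["swanlab"]).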
paddlenlp/trainer/training_args.py

Lines changed: 7 additions & 1 deletion
@@ -376,7 +376,7 @@ class TrainingArguments:
             instance of `Dataset`.
         report_to (`str` or `List[str]`, *optional*, defaults to `"visualdl"`):
             The list of integrations to report the results and logs to.
-            Supported platforms are `"visualdl"`/`"wandb"`/`"tensorboard"`.
+            Supported platforms are `"visualdl"`/`"wandb"`/`"tensorboard"`/`"swanlab"`.
             `"none"` for no integrations.
         ddp_find_unused_parameters (`bool`, *optional*):
             When using distributed training, the value of the flag `find_unused_parameters` passed to
@@ -385,6 +385,8 @@
             Weights & Biases (WandB) API key(s) for authentication with the WandB service.
         wandb_http_proxy (`str`, *optional*):
             Weights & Biases (WandB) http proxy for connecting with the WandB service.
+        swanlab_api_key (`str`, *optional*):
+            Swanlab API key for authentication with the Swanlab service.
         resume_from_checkpoint (`str`, *optional*):
             The path to a folder with a valid checkpoint for your model. This argument is not directly used by
             [`Trainer`], it's intended to be used by your training/evaluation scripts instead. See the [example
@@ -888,6 +890,10 @@ class TrainingArguments:
         default=None,
         metadata={"help": "Weights & Biases (WandB) http proxy for connecting with the WandB service."},
     )
+    swanlab_api_key: Optional[str] = field(
+        default=None,
+        metadata={"help": "Swanlab API key for authentication with the Swanlab service."},
+    )
     resume_from_checkpoint: Optional[str] = field(
         default=None,
         metadata={"help": "The path to a folder with a valid checkpoint for your model."},

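As a hedged usage sketch (not part of the commit), the new `swanlab_api_key` field and the `SWANLAB_*` environment variables documented above would typically be combined as follows; the project name and key string are placeholders, and `SWANLAB_MODE="disabled"` keeps the example offline.

# Hedged usage sketch, not part of the commit. The key string and project name are
# placeholders; nothing is uploaded because SWANLAB_MODE is set to "disabled".
import os

from paddlenlp.trainer import TrainingArguments

os.environ["SWANLAB_PROJECT"] = "my-paddlenlp-experiments"  # overrides the "PaddleNLP" default
os.environ["SWANLAB_MODE"] = "disabled"                     # "cloud" (default) / "local" / "disabled"

args = TrainingArguments(
    output_dir="./output",
    report_to=["swanlab"],               # now supported alongside visualdl/wandb/tensorboard
    swanlab_api_key="your-swanlab-key",  # optional; SwanLabCallback.setup() forwards it to swanlab.login()
    logging_steps=20,
)
print(args.swanlab_api_key)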
tests/trainer/test_trainer_visualization.py

Lines changed: 29 additions & 0 deletions
@@ -25,6 +25,7 @@
     TensorBoardCallback,
     VisualDLCallback,
     WandbCallback,
+    SwanLabCallback,
 )
 from tests.trainer.trainer_utils import RegressionModelConfig, RegressionPretrainedModel

@@ -65,6 +66,34 @@ def test_wandbcallback(self):
         os.environ.pop("WANDB_MODE", None)
         shutil.rmtree(output_dir)

+class TestSwanlabCallback(unittest.TestCase):
+    def test_swanlabcallback(self):
+        output_dir = tempfile.mkdtemp()
+        args = TrainingArguments(
+            output_dir=output_dir,
+            max_steps=200,
+            logging_steps=20,
+            run_name="test_swanlabcallback",
+            logging_dir=output_dir,
+        )
+        state = TrainerState(trial_name="PaddleNLP")
+        control = TrainerControl()
+        config = RegressionModelConfig(a=1, b=1)
+        model = RegressionPretrainedModel(config)
+        os.environ["SWANLAB_MODE"] = "disabled"
+        swanlabcallback = SwanLabCallback()
+        self.assertFalse(swanlabcallback._initialized)
+        swanlabcallback.on_train_begin(args, state, control)
+        self.assertTrue(swanlabcallback._initialized)
+        for global_step in range(args.max_steps):
+            state.global_step = global_step
+            if global_step % args.logging_steps == 0:
+                log = {"loss": 100 - 0.4 * global_step, "learning_rate": 0.1, "global_step": global_step}
+                swanlabcallback.on_log(args, state, control, logs=log)
+        swanlabcallback.on_train_end(args, state, control, model=model)
+        swanlabcallback._swanlab.finish()
+        os.environ.pop("SWANLAB_MODE", None)
+        shutil.rmtree(output_dir)
+

 class TestTensorboardCallback(unittest.TestCase):
     def test_tbcallback(self):
