@@ -1178,26 +1178,24 @@ def forward(self, prediction_scores, masked_lm_labels):
                 masked_lm_labels.unsqueeze(2),
             )
 
-            # XPU dose not support allgather mask with bool dtype, so we use LocalLayer here.
+            # XPU does not support allgather mask with bool dtype, so we use local_map here.
             if get_env_device() == "xpu":
 
-                class LocalLossLayer(paddle.distributed.LocalLayer):
-                    def __init__(self, out_dist_attrs, grad_dist_attrs):
-                        super().__init__(out_dist_attrs, grad_dist_attrs)
-
-                    def forward(self, x, mask):
-                        masked_lm_loss = paddle.masked_select(x, mask).astype("float32")
-                        loss = paddle.mean(masked_lm_loss).unsqueeze(0)
-                        return loss.unsqueeze(0)
+                def calculate_loss(x, mask):
+                    masked_lm_loss = paddle.masked_select(x, mask).astype("float32")
+                    loss = paddle.mean(masked_lm_loss).unsqueeze(0)
+                    return loss.unsqueeze(0)
 
                 out_dist_attrs = [
-                    (masked_lm_loss.process_mesh, [dist.Shard(0), dist.Replicate()]),
+                    [dist.Shard(0), dist.Replicate()],
                 ]
                 grad_dist_attrs = [
-                    (masked_lm_loss.process_mesh, [dist.Shard(0), dist.Replicate()]),
+                    [dist.Shard(0), dist.Replicate()],
                     None,
                 ]
-                loss_func = LocalLossLayer(out_dist_attrs, grad_dist_attrs)
+                loss_func = dist.local_map(
+                    calculate_loss, out_dist_attrs, grad_dist_attrs, masked_lm_loss.process_mesh, reshard_inputs=True
+                )
 
                 loss = loss_func(masked_lm_loss, masked_lm_loss > 0)
                 loss = loss.mean()
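For context, below is a minimal sketch of the pattern this hunk introduces: the loss computation becomes a plain function that operates on each rank's local shard, and dist.local_map wraps it in place of the old paddle.distributed.LocalLayer subclass. The helper names (masked_mean_loss, build_loss_func) are illustrative only, and the meaning of the positional arguments to dist.local_map is assumed from the call shown in the diff rather than taken from verified API documentation.

import paddle
import paddle.distributed as dist


def masked_mean_loss(x, mask):
    # Runs on each rank's local tensor shard, so the bool mask is only used
    # locally and never needs an allgather (the XPU limitation noted above).
    selected = paddle.masked_select(x, mask).astype("float32")
    return paddle.mean(selected).unsqueeze(0).unsqueeze(0)


def build_loss_func(process_mesh):
    # Placement of the single output: sharded along dim 0, replicated on the
    # second mesh dimension (mirrors out_dist_attrs in the diff).
    out_dist_attrs = [
        [dist.Shard(0), dist.Replicate()],
    ]
    # One entry per input: the gradient for x keeps the same placement;
    # None for the bool mask, which carries no gradient.
    grad_dist_attrs = [
        [dist.Shard(0), dist.Replicate()],
        None,
    ]
    # Same positional order as the call in the diff; reshard_inputs=True is
    # assumed to let local_map reshard inputs to the expected placements.
    return dist.local_map(
        masked_mean_loss, out_dist_attrs, grad_dist_attrs, process_mesh, reshard_inputs=True
    )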