
Commit f93dfc4

Merge branch 'PaddlePaddle:develop' into fake_dp_mode
2 parents: b157f55 + e7007d1

104 files changed: +1786 additions, −339 deletions


.github/workflows/_Linux-XPU.yml

Lines changed: 2 additions & 2 deletions

@@ -59,7 +59,7 @@ jobs:
       CCACHE_DIR: /root/.ccache
       CCACHE_MAXSIZE: 150G
       CCACHE_LIMIT_MULTIPLE: 0.8
-      IF_KUNLUN3: "ON"
+      IF_KUNLUN3: "OFF"
       GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       home_dir: ${{ github.workspace }}/../../../..
     run: |
@@ -216,7 +216,7 @@ jobs:
       CCACHE_DIR: /root/.ccache
       CCACHE_MAXSIZE: 150G
       CCACHE_LIMIT_MULTIPLE: 0.8
-      IF_KUNLUN3: "ON"
+      IF_KUNLUN3: "OFF"
       GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       home_dir: ${{ github.workspace }}/../../../..
       FLAGS_use_stride_kernel: "0"

cmake/cinn/core.cmake

Lines changed: 13 additions & 4 deletions

@@ -73,8 +73,12 @@ function(cinn_cc_test TARGET_NAME)
     add_executable(${TARGET_NAME} ${cinn_cc_test_SRCS})
     get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
     target_link_libraries(${TARGET_NAME} ${os_dependency_modules}
-                          cinn_gtest_main gtest glog ${cinn_cc_test_DEPS})
-    add_dependencies(${TARGET_NAME} cinn_gtest_main gtest glog
+                          paddle_gtest_main gtest glog ${cinn_cc_test_DEPS})
+    if(WITH_SHARED_PHI)
+      target_link_libraries(${TARGET_NAME} -Wl,--as-needed phi_core phi_gpu
+                            -Wl,--no-as-needed)
+    endif()
+    add_dependencies(${TARGET_NAME} paddle_gtest_main gtest glog
                      ${cinn_cc_test_DEPS})

     add_test(
@@ -159,13 +163,18 @@ function(cinn_nv_test TARGET_NAME)
     target_link_libraries(
       ${TARGET_NAME}
       ${cinn_nv_test_DEPS}
-      cinn_gtest_main
+      paddle_gtest_main
       gtest
       ${os_dependency_modules}
       ${CUDNN_LIBRARY}
       ${CUBLAS_LIBRARIES}
       ${CUDA_LIBRARIES})
-    add_dependencies(${TARGET_NAME} ${cinn_nv_test_DEPS} cinn_gtest_main gtest)
+    if(WITH_SHARED_PHI)
+      target_link_libraries(${TARGET_NAME} -Wl,--as-needed phi_core phi_gpu
+                            -Wl,--no-as-needed)
+    endif()
+    add_dependencies(${TARGET_NAME} ${cinn_nv_test_DEPS} paddle_gtest_main
+                     gtest)
     common_link(${TARGET_NAME})
     add_test(
       NAME ${TARGET_NAME}

cmake/external/cub.cmake

Lines changed: 7 additions & 2 deletions

@@ -26,11 +26,16 @@ set(CUB_SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/cub)

 if(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.6)
   # cuda_11.6/11.7/11.8's own cub is 1.15.0, which will cause compiling error in windows.
-  set(CUB_TAG 1.16.0)
+  set(CUB_TAG 2.1.0)
   execute_process(COMMAND git --git-dir=${CUB_SOURCE_DIR}/.git
                   --work-tree=${CUB_SOURCE_DIR} checkout ${CUB_TAG})
-  # cub 1.16.0 is not compatible with current thrust version
+  # cub 2.1.0 is not compatible with current thrust version
   add_definitions(-DTHRUST_IGNORE_CUB_VERSION_CHECK)
+  if(${CMAKE_CUDA_COMPILER_VERSION} EQUAL 11.8)
+    set(cub_patches "${PADDLE_SOURCE_DIR}/patches/cub")
+    message(STATUS "Add cub patches: ${cub_patches}")
+    include_directories(${cub_patches})
+  endif()
 else()
   set(CUB_TAG 1.8.0)
 endif()

cmake/generic.cmake

Lines changed: 3 additions & 2 deletions

@@ -599,8 +599,9 @@ function(paddle_test_build TARGET_NAME)
     target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES})
   endif()
   if(WITH_CINN)
-    target_link_libraries(${TARGET_NAME} $<TARGET_LINKER_FILE:cinnapi>
-                          cinn_transforms)
+    target_link_libraries(${TARGET_NAME} -Wl,--as-needed cinnapi
+                          -Wl,--no-as-needed)
+    target_link_libraries(${TARGET_NAME} cinn_transforms)
     add_dependencies(${TARGET_NAME} cinnapi)
   endif()
   if(WITH_XPU)

cmake/third_party.cmake

Lines changed: 3 additions & 1 deletion

@@ -482,7 +482,9 @@ if(WITH_ONNXRUNTIME)
 endif()

 if(WITH_GPU)
-  if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
+  if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0
+     OR (${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.7
+         AND ${CMAKE_CUDA_COMPILER_VERSION} LESS 11.9))
     include(external/cub) # download cub
     list(APPEND third_party_deps extern_cub)
   elseif(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 12.0 AND WITH_SHARED_PHI)

paddle/fluid/distributed/collective/process_group_custom.cc

Lines changed: 32 additions & 23 deletions

@@ -215,7 +215,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllGather(
   return RunFnInXCCLEnv(
       [&](const phi::stream::Stream& stream) {
         auto comm_context = this->GetCommContext();
-        comm_context->AllGather(out_tensor, in_tensor_maybe_partial, stream);
+        comm_context->AllGather(
+            out_tensor, in_tensor_maybe_partial, stream.raw_stream());
       },
       in_tensor_maybe_partial,
       CommType::ALLGATHER,
@@ -239,7 +240,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllReduce(
             out_tensor,
             in_tensor,
             paddle::distributed::ToXCCLRedType(opts.reduce_op),
-            stream);
+            stream.raw_stream());
       },
       in_tensor,
       CommType::ALLREDUCE,
@@ -315,7 +316,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllToAll(
             rank_,
             size_,
             comm_context->GetXcclComm(),
-            stream);
+            stream.raw_stream());
       },
       in_tensor,
       CommType::ALLTOALL,
@@ -358,7 +359,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Broadcast(
       [&](const phi::stream::Stream& stream) {
         int root = opts.source_rank + opts.source_root;
         auto comm_context = this->GetCommContext();
-        comm_context->Broadcast(out_tensor, in_tensor, root, stream);
+        comm_context->Broadcast(
+            out_tensor, in_tensor, root, stream.raw_stream());
       },
       in_tensor,
       CommType::BROADCAST,
@@ -382,7 +384,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Reduce(
             in_tensor,
             paddle::distributed::ToXCCLRedType(opts.reduce_op),
             opts.root_rank,
-            stream);
+            stream.raw_stream());
       },
       in_tensor,
       CommType::REDUCE,
@@ -406,7 +408,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::ReduceScatter(
             out_tensor,
             in_tensor,
             paddle::distributed::ToXCCLRedType(opts.reduce_op),
-            stream);
+            stream.raw_stream());
       },
       in_tensor,
       CommType::REDUCE_SCATTER,
@@ -441,7 +443,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Scatter(
           for (auto i = 0; i < size_; i++) {
             partial_tensor = GetPartialTensor(in_tensor, offset, numel);
             if (i != rank_) {
-              comm_context->Send(partial_tensor, numel, i, stream);
+              comm_context->Send(partial_tensor, numel, i, stream.raw_stream());
             } else {
               phi::DeviceManager::GetDeviceWithPlace(stream.GetPlace())
                   ->MemoryCopyD2D(out_tensor->data(),
@@ -452,7 +454,8 @@
             offset += numel;
           }
         } else {
-          comm_context->Recv(out_tensor, numel, opts.root_rank, stream);
+          comm_context->Recv(
+              out_tensor, numel, opts.root_rank, stream.raw_stream());
         }
       },
       in_tensor,
@@ -506,7 +509,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Gather(
       for (auto i = 0; i < size_; i++) {
         auto& gather_tensor = gather_tensors[i];
         if (i != rank_) {
-          comm_context->Recv(&gather_tensor, gather_tensor.numel(), i, stream);
+          comm_context->Recv(
+              &gather_tensor, gather_tensor.numel(), i, stream.raw_stream());
         } else {
           phi::DeviceManager::GetDeviceWithPlace(stream.GetPlace())
               ->MemoryCopyD2D(
@@ -518,7 +522,8 @@
       }
     } else {
       // send to root
-      comm_context->Send(in_tensor, in_tensor.numel(), opts.root_rank, stream);
+      comm_context->Send(
+          in_tensor, in_tensor.numel(), opts.root_rank, stream.raw_stream());
     }
   };
   return RunFnInXCCLEnv(
@@ -542,7 +547,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Recv(
   return RunFnInXCCLEnv(
       [&](const phi::stream::Stream& stream) {
         auto comm_context = this->GetCommContext();
-        comm_context->Recv(tensor, tensor->numel(), src_rank, stream);
+        comm_context->Recv(
+            tensor, tensor->numel(), src_rank, stream.raw_stream());
       },
       *tensor,
       CommType::RECV,
@@ -569,7 +575,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Send(
         comm_context->Send(tensor_maybe_partial,
                            tensor_maybe_partial.numel(),
                            dst_rank,
-                           stream);
+                           stream.raw_stream());
       },
       tensor_maybe_partial,
       CommType::SEND,
@@ -915,7 +921,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllReduce(
             &output,
             input,
             paddle::distributed::ToXCCLRedType(opts.reduce_op),
-            stream);
+            stream.raw_stream());
       },
       CommType::ALLREDUCE);
 }
@@ -942,7 +948,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Broadcast(
         const auto root =
             opts.source_rank * in_tensors.size() + opts.source_root;
         auto comm_context = this->GetCommContext();
-        comm_context->Broadcast(&output, input, root, stream);
+        comm_context->Broadcast(&output, input, root, stream.raw_stream());
       },
       CommType::BROADCAST);
 }
@@ -988,7 +994,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Send(
          const phi::stream::Stream& stream,
          int dst_rank) {
         auto comm_context = this->GetCommContext();
-        comm_context->Send(input, input.numel(), dst_rank, stream);
+        comm_context->Send(input, input.numel(), dst_rank, stream.raw_stream());
       },
       dst_rank,
       CommType::SEND);
@@ -1008,7 +1014,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Recv(
          const phi::stream::Stream& stream,
          int src_rank) {
         auto comm_context = this->GetCommContext();
-        comm_context->Recv(&output, output.numel(), src_rank, stream);
+        comm_context->Recv(
+            &output, output.numel(), src_rank, stream.raw_stream());
       },
       src_rank,
       CommType::RECV);
@@ -1037,7 +1044,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllGather(
          const phi::ccl::CCLComm& comm,
          const phi::stream::Stream& stream) {
         auto comm_context = this->GetCommContext();
-        comm_context->AllGather(&output, input, stream);
+        comm_context->AllGather(&output, input, stream.raw_stream());
       },
       CommType::ALLGATHER);
 }
@@ -1089,7 +1096,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllToAll(
             rank_,
             size_,
             comm_context->GetXcclComm(),
-            stream);
+            stream.raw_stream());
       },
       CommType::ALLTOALL);
 }
@@ -1166,7 +1173,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllToAll(
             rank_,
             size_,
             comm_context->GetXcclComm(),
-            stream);
+            stream.raw_stream());
       },
       in_tensors,
       CommType::ALLTOALL,
@@ -1197,7 +1204,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Reduce(
             input,
             paddle::distributed::ToXCCLRedType(opts.reduce_op),
             opts.root_rank,
-            stream);
+            stream.raw_stream());
       },
       CommType::REDUCE);
 }
@@ -1232,13 +1239,15 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Scatter(
           for (auto i = 0; i < size_; i++) {
             auto input_data = reinterpret_cast<phi::DenseTensor*>(
                 GetPointerByOffset(input.data(), offset, input.dtype()));
-            comm_context->Send(*input_data, count, i, stream);
+            comm_context->Send(*input_data, count, i, stream.raw_stream());
             offset += count;
           }
-          comm_context->Recv(&output, count, opts.root_rank, stream);
+          comm_context->Recv(
+              &output, count, opts.root_rank, stream.raw_stream());
           comm_context->GroupEnd();
         } else {
-          comm_context->Recv(&output, count, opts.root_rank, stream);
+          comm_context->Recv(
+              &output, count, opts.root_rank, stream.raw_stream());
         }
       },
       CommType::SCATTER);
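
Every hunk in this file follows the same mechanical pattern: the XCCL comm-context calls now receive the backend's raw stream handle instead of the phi::stream::Stream wrapper. A minimal sketch of that call-site shape, taken from the AllGather hunk above (the lambda signature itself is unchanged):

    // Sketch of the repeated pattern: the lambda still takes the wrapper,
    // but the comm context is handed the underlying raw handle.
    [&](const phi::stream::Stream& stream) {
      auto comm_context = this->GetCommContext();
      // before: comm_context->AllGather(out_tensor, in_tensor_maybe_partial, stream);
      comm_context->AllGather(
          out_tensor, in_tensor_maybe_partial, stream.raw_stream());
    }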

paddle/fluid/eager/backward.cc

Lines changed: 24 additions & 3 deletions

@@ -192,9 +192,30 @@ std::vector<paddle::Tensor> RunBackward(
       // Feed given tensor if it's provided
       VLOG(3) << "Fill grad input tensor " << i << "with give grad tensor";

-      // Deep copy
-      node_input_buffers_dict[grad_node]->CopyValueFromTensor(
-          input_info.first, input_info.second, grad_tensors[i]);
+      bool use_shared_buffer = false;
+      // Check if inputs and outputs are equal in size and share the same buffer
+      if (tensors.size() == inputs.size() &&
+          tensors[i].numel() == inputs[i].numel()) {
+        auto output_tensor =
+            std::dynamic_pointer_cast<phi::DenseTensor>(tensors[i].impl());
+        auto input_tensor =
+            std::dynamic_pointer_cast<phi::DenseTensor>(inputs[i].impl());
+        use_shared_buffer = output_tensor->IsSharedBufferWith(*input_tensor);
+      }
+
+      if (use_shared_buffer) {
+        // Share buffer with given grad_tensor
+        paddle::small_vector<std::vector<paddle::Tensor>, kSlotSmallVectorSize>
+            inputs_grad_tensors;
+        inputs_grad_tensors.push_back({grad_tensors[i]});
+        auto grad_holder = GradTensorHolder(std::move(inputs_grad_tensors));
+        node_input_buffers_dict[grad_node] =
+            std::make_unique<GradTensorHolder>(grad_holder);
+      } else {
+        // Deep copy
+        node_input_buffers_dict[grad_node]->CopyValueFromTensor(
+            input_info.first, input_info.second, grad_tensors[i]);
+      }
     } else {
       VLOG(3) << "Fill grad input tensor " << i << " with 1.0";
       // Initialize tensor with 1.0
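
The new branch only engages when the forward outputs and the supplied inputs alias the same allocation. A small sketch of that test pulled out into a standalone helper; the helper name and the extra null-checks are additions for illustration only, not part of the patch:

    // Hypothetical helper isolating the shared-buffer check used above.
    // impl() yields the underlying phi::DenseTensor for dense tensors; the
    // null-checks are extra guards added only for this sketch.
    bool GradBufferIsShared(const paddle::Tensor& output,
                            const paddle::Tensor& input) {
      if (output.numel() != input.numel()) return false;
      auto out_dense = std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
      auto in_dense = std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
      if (!out_dense || !in_dense) return false;
      return out_dense->IsSharedBufferWith(*in_dense);
    }

When the check holds, the provided grad tensor is wrapped directly into a fresh GradTensorHolder; otherwise the existing holder still receives a deep copy via CopyValueFromTensor.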

paddle/fluid/eager/custom_operator/custom_operator_node.cc

Lines changed: 8 additions & 0 deletions

@@ -373,6 +373,10 @@ RunCustomOpNode::operator()(paddle::small_vector<std::vector<paddle::Tensor>,
     grad_node->SetAttrs(attrs);
   }

+  if (HasNodePostHook()) {
+    outs = ApplyNodePostHooks(outs, hooked_grads);
+  }
+
   return outs;
 }

@@ -459,6 +463,10 @@ RunCustomOpDoubleGradNode::operator()(
     outs[i] = ctx.OutputsBetween(output_pair.first, output_pair.second);
   }

+  if (HasNodePostHook()) {
+    outs = ApplyNodePostHooks(outs, hooked_grads);
+  }
+
   return outs;
 }
 }  // namespace egr

paddle/fluid/imperative/gradient_accumulator.cc

Lines changed: 1 addition & 1 deletion

@@ -221,7 +221,7 @@ void TensorAdd(const VarType& src, VarType* dst) {
         phi::DeviceContextPool::Instance().Get(place));          \
     phi::stream::Stream stream(place, ctx->stream());            \
     auto device = phi::DeviceManager::GetDeviceWithPlace(place); \
-    device->BlasAXPBY<T>(stream,                                 \
+    device->BlasAXPBY<T>(stream.raw_stream(),                    \
                          static_cast<size_t>(numel),             \
                          1.,                                      \
                          src_tensor.data<T>(),                   \
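
Only the stream argument changes here. For reference, assuming BlasAXPBY follows the conventional AXPBY meaning y = a·x + b·y (the hunk shows a = 1. for gradient accumulation), a plain CPU loop with those semantics would look like the sketch below; the real call dispatches to the custom device via device->BlasAXPBY<T>(...):

    // Reference-only sketch of conventional AXPBY semantics:
    // y[i] = a * x[i] + b * y[i].
    template <typename T>
    void AxpbyReference(size_t n, T a, const T* x, T b, T* y) {
      for (size_t i = 0; i < n; ++i) {
        y[i] = a * x[i] + b * y[i];
      }
    }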
