
Commit f93dfc4

Merge branch 'PaddlePaddle:develop' into fake_dp_mode
2 parents: b157f55 + e7007d1

104 files changed: +1786 additions, −339 deletions


.github/workflows/_Linux-XPU.yml

Lines changed: 2 additions & 2 deletions

@@ -59,7 +59,7 @@ jobs:
       CCACHE_DIR: /root/.ccache
       CCACHE_MAXSIZE: 150G
       CCACHE_LIMIT_MULTIPLE: 0.8
-      IF_KUNLUN3: "ON"
+      IF_KUNLUN3: "OFF"
       GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       home_dir: ${{ github.workspace }}/../../../..
     run: |
@@ -216,7 +216,7 @@ jobs:
       CCACHE_DIR: /root/.ccache
       CCACHE_MAXSIZE: 150G
       CCACHE_LIMIT_MULTIPLE: 0.8
-      IF_KUNLUN3: "ON"
+      IF_KUNLUN3: "OFF"
       GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       home_dir: ${{ github.workspace }}/../../../..
       FLAGS_use_stride_kernel: "0"

cmake/cinn/core.cmake

Lines changed: 13 additions & 4 deletions

@@ -73,8 +73,12 @@ function(cinn_cc_test TARGET_NAME)
     add_executable(${TARGET_NAME} ${cinn_cc_test_SRCS})
     get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
     target_link_libraries(${TARGET_NAME} ${os_dependency_modules}
-                          cinn_gtest_main gtest glog ${cinn_cc_test_DEPS})
-    add_dependencies(${TARGET_NAME} cinn_gtest_main gtest glog
+                          paddle_gtest_main gtest glog ${cinn_cc_test_DEPS})
+    if(WITH_SHARED_PHI)
+      target_link_libraries(${TARGET_NAME} -Wl,--as-needed phi_core phi_gpu
+                            -Wl,--no-as-needed)
+    endif()
+    add_dependencies(${TARGET_NAME} paddle_gtest_main gtest glog
                      ${cinn_cc_test_DEPS})

     add_test(
@@ -159,13 +163,18 @@ function(cinn_nv_test TARGET_NAME)
     target_link_libraries(
       ${TARGET_NAME}
       ${cinn_nv_test_DEPS}
-      cinn_gtest_main
+      paddle_gtest_main
       gtest
       ${os_dependency_modules}
       ${CUDNN_LIBRARY}
       ${CUBLAS_LIBRARIES}
       ${CUDA_LIBRARIES})
-    add_dependencies(${TARGET_NAME} ${cinn_nv_test_DEPS} cinn_gtest_main gtest)
+    if(WITH_SHARED_PHI)
+      target_link_libraries(${TARGET_NAME} -Wl,--as-needed phi_core phi_gpu
+                            -Wl,--no-as-needed)
+    endif()
+    add_dependencies(${TARGET_NAME} ${cinn_nv_test_DEPS} paddle_gtest_main
+                     gtest)
     common_link(${TARGET_NAME})
     add_test(
       NAME ${TARGET_NAME}

cmake/external/cub.cmake

Lines changed: 7 additions & 2 deletions

@@ -26,11 +26,16 @@ set(CUB_SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/cub)

 if(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.6)
   # cuda_11.6/11.7/11.8's own cub is 1.15.0, which will cause compiling error in windows.
-  set(CUB_TAG 1.16.0)
+  set(CUB_TAG 2.1.0)
   execute_process(COMMAND git --git-dir=${CUB_SOURCE_DIR}/.git
                   --work-tree=${CUB_SOURCE_DIR} checkout ${CUB_TAG})
-  # cub 1.16.0 is not compatible with current thrust version
+  # cub 2.1.0 is not compatible with current thrust version
   add_definitions(-DTHRUST_IGNORE_CUB_VERSION_CHECK)
+  if(${CMAKE_CUDA_COMPILER_VERSION} EQUAL 11.8)
+    set(cub_patches "${PADDLE_SOURCE_DIR}/patches/cub")
+    message(STATUS "Add cub patches: ${cub_patches}")
+    include_directories(${cub_patches})
+  endif()
 else()
   set(CUB_TAG 1.8.0)
 endif()

cmake/generic.cmake

Lines changed: 3 additions & 2 deletions

@@ -599,8 +599,9 @@ function(paddle_test_build TARGET_NAME)
     target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES})
   endif()
   if(WITH_CINN)
-    target_link_libraries(${TARGET_NAME} $<TARGET_LINKER_FILE:cinnapi>
-                          cinn_transforms)
+    target_link_libraries(${TARGET_NAME} -Wl,--as-needed cinnapi
+                          -Wl,--no-as-needed)
+    target_link_libraries(${TARGET_NAME} cinn_transforms)
     add_dependencies(${TARGET_NAME} cinnapi)
   endif()
   if(WITH_XPU)

cmake/third_party.cmake

Lines changed: 3 additions & 1 deletion

@@ -482,7 +482,9 @@ if(WITH_ONNXRUNTIME)
 endif()

 if(WITH_GPU)
-  if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
+  if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0
+     OR (${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.7
+         AND ${CMAKE_CUDA_COMPILER_VERSION} LESS 11.9))
     include(external/cub) # download cub
     list(APPEND third_party_deps extern_cub)
   elseif(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 12.0 AND WITH_SHARED_PHI)

paddle/fluid/distributed/collective/process_group_custom.cc

Lines changed: 32 additions & 23 deletions

@@ -215,7 +215,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllGather(
   return RunFnInXCCLEnv(
       [&](const phi::stream::Stream& stream) {
         auto comm_context = this->GetCommContext();
-        comm_context->AllGather(out_tensor, in_tensor_maybe_partial, stream);
+        comm_context->AllGather(
+            out_tensor, in_tensor_maybe_partial, stream.raw_stream());
       },
       in_tensor_maybe_partial,
       CommType::ALLGATHER,
@@ -239,7 +240,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllReduce(
             out_tensor,
             in_tensor,
             paddle::distributed::ToXCCLRedType(opts.reduce_op),
-            stream);
+            stream.raw_stream());
       },
       in_tensor,
       CommType::ALLREDUCE,
@@ -315,7 +316,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllToAll(
             rank_,
             size_,
             comm_context->GetXcclComm(),
-            stream);
+            stream.raw_stream());
       },
       in_tensor,
       CommType::ALLTOALL,
@@ -358,7 +359,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Broadcast(
       [&](const phi::stream::Stream& stream) {
         int root = opts.source_rank + opts.source_root;
         auto comm_context = this->GetCommContext();
-        comm_context->Broadcast(out_tensor, in_tensor, root, stream);
+        comm_context->Broadcast(
+            out_tensor, in_tensor, root, stream.raw_stream());
       },
       in_tensor,
       CommType::BROADCAST,
@@ -382,7 +384,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Reduce(
             in_tensor,
             paddle::distributed::ToXCCLRedType(opts.reduce_op),
             opts.root_rank,
-            stream);
+            stream.raw_stream());
       },
       in_tensor,
       CommType::REDUCE,
@@ -406,7 +408,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::ReduceScatter(
             out_tensor,
             in_tensor,
             paddle::distributed::ToXCCLRedType(opts.reduce_op),
-            stream);
+            stream.raw_stream());
       },
       in_tensor,
       CommType::REDUCE_SCATTER,
@@ -441,7 +443,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Scatter(
           for (auto i = 0; i < size_; i++) {
             partial_tensor = GetPartialTensor(in_tensor, offset, numel);
             if (i != rank_) {
-              comm_context->Send(partial_tensor, numel, i, stream);
+              comm_context->Send(partial_tensor, numel, i, stream.raw_stream());
             } else {
               phi::DeviceManager::GetDeviceWithPlace(stream.GetPlace())
                   ->MemoryCopyD2D(out_tensor->data(),
@@ -452,7 +454,8 @@
             offset += numel;
           }
         } else {
-          comm_context->Recv(out_tensor, numel, opts.root_rank, stream);
+          comm_context->Recv(
+              out_tensor, numel, opts.root_rank, stream.raw_stream());
         }
       },
       in_tensor,
@@ -506,7 +509,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Gather(
       for (auto i = 0; i < size_; i++) {
         auto& gather_tensor = gather_tensors[i];
         if (i != rank_) {
-          comm_context->Recv(&gather_tensor, gather_tensor.numel(), i, stream);
+          comm_context->Recv(
+              &gather_tensor, gather_tensor.numel(), i, stream.raw_stream());
         } else {
           phi::DeviceManager::GetDeviceWithPlace(stream.GetPlace())
               ->MemoryCopyD2D(
@@ -518,7 +522,8 @@
       }
     } else {
       // send to root
-      comm_context->Send(in_tensor, in_tensor.numel(), opts.root_rank, stream);
+      comm_context->Send(
+          in_tensor, in_tensor.numel(), opts.root_rank, stream.raw_stream());
     }
   };
   return RunFnInXCCLEnv(
@@ -542,7 +547,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Recv(
   return RunFnInXCCLEnv(
       [&](const phi::stream::Stream& stream) {
         auto comm_context = this->GetCommContext();
-        comm_context->Recv(tensor, tensor->numel(), src_rank, stream);
+        comm_context->Recv(
+            tensor, tensor->numel(), src_rank, stream.raw_stream());
       },
       *tensor,
       CommType::RECV,
@@ -569,7 +575,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Send(
         comm_context->Send(tensor_maybe_partial,
                            tensor_maybe_partial.numel(),
                            dst_rank,
-                           stream);
+                           stream.raw_stream());
       },
       tensor_maybe_partial,
       CommType::SEND,
@@ -915,7 +921,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllReduce(
             &output,
             input,
             paddle::distributed::ToXCCLRedType(opts.reduce_op),
-            stream);
+            stream.raw_stream());
       },
       CommType::ALLREDUCE);
 }
@@ -942,7 +948,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Broadcast(
         const auto root =
             opts.source_rank * in_tensors.size() + opts.source_root;
         auto comm_context = this->GetCommContext();
-        comm_context->Broadcast(&output, input, root, stream);
+        comm_context->Broadcast(&output, input, root, stream.raw_stream());
       },
       CommType::BROADCAST);
 }
@@ -988,7 +994,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Send(
          const phi::stream::Stream& stream,
          int dst_rank) {
         auto comm_context = this->GetCommContext();
-        comm_context->Send(input, input.numel(), dst_rank, stream);
+        comm_context->Send(input, input.numel(), dst_rank, stream.raw_stream());
       },
       dst_rank,
       CommType::SEND);
@@ -1008,7 +1014,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Recv(
          const phi::stream::Stream& stream,
          int src_rank) {
         auto comm_context = this->GetCommContext();
-        comm_context->Recv(&output, output.numel(), src_rank, stream);
+        comm_context->Recv(
+            &output, output.numel(), src_rank, stream.raw_stream());
       },
       src_rank,
       CommType::RECV);
@@ -1037,7 +1044,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllGather(
          const phi::ccl::CCLComm& comm,
          const phi::stream::Stream& stream) {
         auto comm_context = this->GetCommContext();
-        comm_context->AllGather(&output, input, stream);
+        comm_context->AllGather(&output, input, stream.raw_stream());
       },
       CommType::ALLGATHER);
 }
@@ -1089,7 +1096,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllToAll(
             rank_,
             size_,
             comm_context->GetXcclComm(),
-            stream);
+            stream.raw_stream());
       },
       CommType::ALLTOALL);
 }
@@ -1166,7 +1173,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllToAll(
             rank_,
             size_,
             comm_context->GetXcclComm(),
-            stream);
+            stream.raw_stream());
       },
       in_tensors,
       CommType::ALLTOALL,
@@ -1197,7 +1204,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Reduce(
             input,
             paddle::distributed::ToXCCLRedType(opts.reduce_op),
             opts.root_rank,
-            stream);
+            stream.raw_stream());
       },
       CommType::REDUCE);
 }
@@ -1232,13 +1239,15 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Scatter(
           for (auto i = 0; i < size_; i++) {
             auto input_data = reinterpret_cast<phi::DenseTensor*>(
                 GetPointerByOffset(input.data(), offset, input.dtype()));
-            comm_context->Send(*input_data, count, i, stream);
+            comm_context->Send(*input_data, count, i, stream.raw_stream());
             offset += count;
           }
-          comm_context->Recv(&output, count, opts.root_rank, stream);
+          comm_context->Recv(
+              &output, count, opts.root_rank, stream.raw_stream());
           comm_context->GroupEnd();
         } else {
-          comm_context->Recv(&output, count, opts.root_rank, stream);
+          comm_context->Recv(
+              &output, count, opts.root_rank, stream.raw_stream());
         }
       },
       CommType::SCATTER);
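
Every hunk in this file follows the same mechanical pattern: the XCCL comm-context calls now receive the backend's raw stream handle instead of the phi::stream::Stream wrapper. A minimal sketch of that call-site shape, taken from the AllGather hunk above (the lambda signature itself is unchanged):

    // Sketch of the repeated pattern: the lambda still takes the wrapper,
    // but the comm context is handed the underlying raw handle.
    [&](const phi::stream::Stream& stream) {
      auto comm_context = this->GetCommContext();
      // before: comm_context->AllGather(out_tensor, in_tensor_maybe_partial, stream);
      comm_context->AllGather(
          out_tensor, in_tensor_maybe_partial, stream.raw_stream());
    }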

paddle/fluid/eager/backward.cc

Lines changed: 24 additions & 3 deletions

@@ -192,9 +192,30 @@ std::vector<paddle::Tensor> RunBackward(
       // Feed given tensor if it's provided
       VLOG(3) << "Fill grad input tensor " << i << "with give grad tensor";

-      // Deep copy
-      node_input_buffers_dict[grad_node]->CopyValueFromTensor(
-          input_info.first, input_info.second, grad_tensors[i]);
+      bool use_shared_buffer = false;
+      // Check if inputs and outputs are equal in size and share the same buffer
+      if (tensors.size() == inputs.size() &&
+          tensors[i].numel() == inputs[i].numel()) {
+        auto output_tensor =
+            std::dynamic_pointer_cast<phi::DenseTensor>(tensors[i].impl());
+        auto input_tensor =
+            std::dynamic_pointer_cast<phi::DenseTensor>(inputs[i].impl());
+        use_shared_buffer = output_tensor->IsSharedBufferWith(*input_tensor);
+      }
+
+      if (use_shared_buffer) {
+        // Share buffer with given grad_tensor
+        paddle::small_vector<std::vector<paddle::Tensor>, kSlotSmallVectorSize>
+            inputs_grad_tensors;
+        inputs_grad_tensors.push_back({grad_tensors[i]});
+        auto grad_holder = GradTensorHolder(std::move(inputs_grad_tensors));
+        node_input_buffers_dict[grad_node] =
+            std::make_unique<GradTensorHolder>(grad_holder);
+      } else {
+        // Deep copy
+        node_input_buffers_dict[grad_node]->CopyValueFromTensor(
+            input_info.first, input_info.second, grad_tensors[i]);
+      }
     } else {
       VLOG(3) << "Fill grad input tensor " << i << " with 1.0";
       // Initialize tensor with 1.0
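
The new branch only engages when the forward outputs and the supplied inputs alias the same allocation. A small sketch of that test pulled out into a standalone helper; the helper name and the extra null-checks are additions for illustration only, not part of the patch:

    // Hypothetical helper isolating the shared-buffer check used above.
    // impl() yields the underlying phi::DenseTensor for dense tensors; the
    // null-checks are extra guards added only for this sketch.
    bool GradBufferIsShared(const paddle::Tensor& output,
                            const paddle::Tensor& input) {
      if (output.numel() != input.numel()) return false;
      auto out_dense = std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
      auto in_dense = std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
      if (!out_dense || !in_dense) return false;
      return out_dense->IsSharedBufferWith(*in_dense);
    }

When the check holds, the provided grad tensor is wrapped directly into a fresh GradTensorHolder; otherwise the existing holder still receives a deep copy via CopyValueFromTensor.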

paddle/fluid/eager/custom_operator/custom_operator_node.cc

Lines changed: 8 additions & 0 deletions

@@ -373,6 +373,10 @@ RunCustomOpNode::operator()(paddle::small_vector<std::vector<paddle::Tensor>,
     grad_node->SetAttrs(attrs);
   }

+  if (HasNodePostHook()) {
+    outs = ApplyNodePostHooks(outs, hooked_grads);
+  }
+
   return outs;
 }

@@ -459,6 +463,10 @@ RunCustomOpDoubleGradNode::operator()(
     outs[i] = ctx.OutputsBetween(output_pair.first, output_pair.second);
   }

+  if (HasNodePostHook()) {
+    outs = ApplyNodePostHooks(outs, hooked_grads);
+  }
+
   return outs;
 }
 }  // namespace egr

paddle/fluid/imperative/gradient_accumulator.cc

Lines changed: 1 addition & 1 deletion

@@ -221,7 +221,7 @@ void TensorAdd(const VarType& src, VarType* dst) {
         phi::DeviceContextPool::Instance().Get(place));          \
     phi::stream::Stream stream(place, ctx->stream());            \
     auto device = phi::DeviceManager::GetDeviceWithPlace(place); \
-    device->BlasAXPBY<T>(stream,                                 \
+    device->BlasAXPBY<T>(stream.raw_stream(),                    \
                          static_cast<size_t>(numel),             \
                          1.,                                      \
                          src_tensor.data<T>(),                   \
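
Only the stream argument changes here. For reference, assuming BlasAXPBY follows the conventional AXPBY meaning y = a·x + b·y (the hunk shows a = 1. for gradient accumulation), a plain CPU loop with those semantics would look like the sketch below; the real call dispatches to the custom device via device->BlasAXPBY<T>(...):

    // Reference-only sketch of conventional AXPBY semantics:
    // y[i] = a * x[i] + b * y[i].
    template <typename T>
    void AxpbyReference(size_t n, T a, const T* x, T b, T* y) {
      for (size_t i = 0; i < n; ++i) {
        y[i] = a * x[i] + b * y[i];
      }
    }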
