-
Notifications
You must be signed in to change notification settings - Fork 5.8k
More data types supported in cv::cuda::transpose() #3371
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: 4.x
Are you sure you want to change the base?
Changes from 3 commits
51180e9
be2dd8d
9e3eb3d
00abef3
e5b152e
4f0e470
43a1691
29cb16a
3672694
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -60,34 +60,160 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) | |
{ | ||
GpuMat src = getInputMat(_src, stream); | ||
|
||
const int srcType = src.type(); | ||
const int srcDepth = src.depth(); | ||
const int srcCn = src.channels(); | ||
const size_t elemSize = src.elemSize(); | ||
|
||
CV_Assert( elemSize == 1 || elemSize == 4 || elemSize == 8 ); | ||
const size_t elemSize1 = src.elemSize1(); | ||
|
||
GpuMat dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream); | ||
|
||
if (elemSize == 1) | ||
const bool isNppiNativelySupported = | ||
(srcType == CV_8UC1) || (srcType == CV_8UC3) || (srcType == CV_8UC4) || | ||
(srcType == CV_16UC1) || (srcType == CV_16UC3) || (srcType == CV_16UC4) || | ||
(srcType == CV_16SC1) || (srcType == CV_16SC3) || (srcType == CV_16SC4) || | ||
(srcType == CV_32SC1) || (srcType == CV_32SC3) || (srcType == CV_32SC4) || | ||
(srcType == CV_32FC1) || (srcType == CV_32FC3) || (srcType == CV_32FC4); | ||
const bool isElemSizeSupportedByNppi = | ||
(!(elemSize%1) && ((elemSize/1)<=4)) || | ||
(!(elemSize%2) && ((elemSize/2)<=4)) || | ||
(!(elemSize%4) && ((elemSize/4)<=4)) || | ||
(!(elemSize%8) && ((elemSize/8)<=2)); | ||
const bool isElemSizeSupportedByGridTranspose = | ||
(elemSize == 1) || (elemSize == 2) || (elemSize == 4) || (elemSize == 8); | ||
const bool isSupported = isNppiNativelySupported || isElemSizeSupportedByNppi || isElemSizeSupportedByGridTranspose; | ||
|
||
if (!isSupported) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would CV_Assert() be better here? |
||
CV_Error(Error::StsUnsupportedFormat, ""); | ||
else if (src.empty()) | ||
CV_Error(Error::StsBadArg,"image is empty"); | ||
|
||
if ((src.cols == 1) && (dst.cols == 1)) | ||
src.copyTo(dst, stream); | ||
else if (((src.cols == 1) || (src.rows == 1)) && (src.cols*src.elemSize() == src.step)) | ||
src.reshape(0, src.cols).copyTo(dst, stream); | ||
else if (isNppiNativelySupported) | ||
{ | ||
NppStreamHandler h(StreamAccessor::getStream(stream)); | ||
|
||
NppiSize sz; | ||
sz.width = src.cols; | ||
sz.height = src.rows; | ||
|
||
nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), | ||
if (srcType == CV_8UC1) | ||
nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), | ||
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) ); | ||
else if (srcType == CV_8UC3) | ||
nppSafeCall( nppiTranspose_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step), | ||
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) ); | ||
else if (srcType == CV_8UC4) | ||
nppSafeCall( nppiTranspose_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step), | ||
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) ); | ||
else if (srcType == CV_16UC1) | ||
nppSafeCall( nppiTranspose_16u_C1R(src.ptr<Npp16u>(), static_cast<int>(src.step), | ||
dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) ); | ||
else if (srcType == CV_16UC3) | ||
nppSafeCall( nppiTranspose_16u_C3R(src.ptr<Npp16u>(), static_cast<int>(src.step), | ||
dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) ); | ||
else if (srcType == CV_16UC4) | ||
nppSafeCall( nppiTranspose_16u_C4R(src.ptr<Npp16u>(), static_cast<int>(src.step), | ||
dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) ); | ||
else if (srcType == CV_16SC1) | ||
nppSafeCall( nppiTranspose_16s_C1R(src.ptr<Npp16s>(), static_cast<int>(src.step), | ||
dst.ptr<Npp16s>(), static_cast<int>(dst.step), sz) ); | ||
else if (srcType == CV_16SC3) | ||
nppSafeCall( nppiTranspose_16s_C3R(src.ptr<Npp16s>(), static_cast<int>(src.step), | ||
dst.ptr<Npp16s>(), static_cast<int>(dst.step), sz) ); | ||
else if (srcType == CV_16SC4) | ||
nppSafeCall( nppiTranspose_16s_C4R(src.ptr<Npp16s>(), static_cast<int>(src.step), | ||
dst.ptr<Npp16s>(), static_cast<int>(dst.step), sz) ); | ||
else if (srcType == CV_32SC1) | ||
nppSafeCall( nppiTranspose_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), | ||
dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz) ); | ||
else if (srcType == CV_32SC3) | ||
nppSafeCall( nppiTranspose_32s_C3R(src.ptr<Npp32s>(), static_cast<int>(src.step), | ||
dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz) ); | ||
else if (srcType == CV_32SC4) | ||
nppSafeCall( nppiTranspose_32s_C4R(src.ptr<Npp32s>(), static_cast<int>(src.step), | ||
dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz) ); | ||
else if (srcType == CV_32FC1) | ||
nppSafeCall( nppiTranspose_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step), | ||
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) ); | ||
else if (srcType == CV_32FC3) | ||
nppSafeCall( nppiTranspose_32f_C3R(src.ptr<Npp32f>(), static_cast<int>(src.step), | ||
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) ); | ||
else if (srcType == CV_32FC4) | ||
nppSafeCall( nppiTranspose_32f_C4R(src.ptr<Npp32f>(), static_cast<int>(src.step), | ||
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) ); | ||
|
||
if (!stream) | ||
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); | ||
} | ||
else if (elemSize == 4) | ||
}//end if (isNppiNativelySupported) | ||
else if (isElemSizeSupportedByNppi) | ||
{ | ||
gridTranspose(globPtr<int>(src), globPtr<int>(dst), stream); | ||
} | ||
else // if (elemSize == 8) | ||
NppStreamHandler h(StreamAccessor::getStream(stream)); | ||
|
||
NppiSize sz; | ||
sz.width = src.cols; | ||
sz.height = src.rows; | ||
|
||
if (!(elemSize%1) && ((elemSize/1)==1)) | ||
nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), | ||
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) ); | ||
else if (!(elemSize%1) && ((elemSize/1)==2)) | ||
nppSafeCall( nppiTranspose_16u_C1R(src.ptr<Npp16u>(), static_cast<int>(src.step), | ||
dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) ); | ||
else if (!(elemSize%1) && ((elemSize/1)==3)) | ||
nppSafeCall( nppiTranspose_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step), | ||
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) ); | ||
else if (!(elemSize%1) && ((elemSize/1)==4)) | ||
nppSafeCall( nppiTranspose_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step), | ||
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) ); | ||
else if (!(elemSize%2) && ((elemSize/2)==1)) | ||
nppSafeCall( nppiTranspose_16u_C1R(src.ptr<Npp16u>(), static_cast<int>(src.step), | ||
dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) ); | ||
else if (!(elemSize%2) && ((elemSize/2)==2)) | ||
nppSafeCall( nppiTranspose_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step), | ||
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) ); | ||
else if (!(elemSize%2) && ((elemSize/2)==3)) | ||
nppSafeCall( nppiTranspose_16u_C3R(src.ptr<Npp16u>(), static_cast<int>(src.step), | ||
dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) ); | ||
else if (!(elemSize%2) && ((elemSize/2)==4)) | ||
nppSafeCall( nppiTranspose_16u_C4R(src.ptr<Npp16u>(), static_cast<int>(src.step), | ||
dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) ); | ||
else if (!(elemSize%4) && ((elemSize/4)==1)) | ||
nppSafeCall( nppiTranspose_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step), | ||
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) ); | ||
else if (!(elemSize%4) && ((elemSize/4)==2)) | ||
nppSafeCall( nppiTranspose_16u_C4R(src.ptr<Npp16u>(), static_cast<int>(src.step), | ||
dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) ); | ||
else if (!(elemSize%4) && ((elemSize/4)==3)) | ||
nppSafeCall( nppiTranspose_32f_C3R(src.ptr<Npp32f>(), static_cast<int>(src.step), | ||
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) ); | ||
else if (!(elemSize%4) && ((elemSize/4)==4)) | ||
nppSafeCall( nppiTranspose_32f_C4R(src.ptr<Npp32f>(), static_cast<int>(src.step), | ||
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) ); | ||
else if (!(elemSize%8) && ((elemSize/8)==1)) | ||
nppSafeCall( nppiTranspose_16u_C4R(src.ptr<Npp16u>(), static_cast<int>(src.step), | ||
dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) ); | ||
else if (!(elemSize%8) && ((elemSize/8)==2)) | ||
nppSafeCall( nppiTranspose_32f_C4R(src.ptr<Npp32f>(), static_cast<int>(src.step), | ||
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) ); | ||
|
||
if (!stream) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this be applied to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have the opposite question : why is it ever needed ? I let them here, but I don't understand the purpose of this extra synchronization when the default stream (sync by default) is used. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Kernel launches are asynchronous with respect. The default stream syncs with other streams by default in legacy mode. My interpretation is that the OpenCV API works on the assumption that if a stream isn't passed the user wants synchronization. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you mean that now that exist both There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No users of the api will expect functions which they don't pass a stream to, to be synchronous with respect to the host when they return. This would not be the case with either In the example from the docs
k_2 waits on k_1 because k_2 is in the legacy default stream, then k_3 waits on the legacy stream. Because of the specific way this has been set up k_1 and k_2 have finished executing before the call to Now I haven't used per thread default streams (I always use explicit streams) but my understanding is that if the CUDA_API_PER_THREAD_DEFAULT_STREAM macro was used to enable per thread default streams k_1 would run before k_3 but both would be asynchronous with respect to k_2. Either way when control returns to the host on line 4 they may still all be in flight. On the other hand if we have the following which is our case if no stream is passed
in either case There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, anyway, I just checked and There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @cudawarped Do you have open issues with the PR. I want to merge, if you do not mind. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @asmorkalov I am not 100% convinced that Additionally
could be
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Whatever the decision, I will certainly agree : I explained my initial code structure, but I am OK to adapt for a more "OpenCV style" There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); | ||
}//end if (isElemSizeSupportedByNppi) | ||
else if (isElemSizeSupportedByGridTranspose) | ||
{ | ||
if (elemSize == 1) | ||
gridTranspose(globPtr<unsigned char>(src), globPtr<unsigned char>(dst), stream); | ||
else if (elemSize == 2) | ||
gridTranspose(globPtr<unsigned short>(src), globPtr<unsigned short>(dst), stream); | ||
else if (elemSize == 4) | ||
gridTranspose(globPtr<signed int>(src), globPtr<signed int>(dst), stream); | ||
else if (elemSize == 8) | ||
gridTranspose(globPtr<double>(src), globPtr<double>(dst), stream); | ||
} | ||
}//end if (isElemSizeSupportedByGridTranspose) | ||
|
||
syncOutput(dst, _dst, stream); | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this redundant?