
More data types supported in cv::cuda::transpose() #3371


Open

wants to merge 9 commits into base: 4.x

Changes from 3 commits
146 changes: 136 additions & 10 deletions modules/cudaarithm/src/cuda/transpose.cu
@@ -60,34 +60,160 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream)
{
GpuMat src = getInputMat(_src, stream);

const int srcType = src.type();
Contributor

Is this redundant?

const int srcDepth = src.depth();
const int srcCn = src.channels();
const size_t elemSize = src.elemSize();

const size_t elemSize1 = src.elemSize1();

GpuMat dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream);

const bool isNppiNativelySupported =
(srcType == CV_8UC1) || (srcType == CV_8UC3) || (srcType == CV_8UC4) ||
(srcType == CV_16UC1) || (srcType == CV_16UC3) || (srcType == CV_16UC4) ||
(srcType == CV_16SC1) || (srcType == CV_16SC3) || (srcType == CV_16SC4) ||
(srcType == CV_32SC1) || (srcType == CV_32SC3) || (srcType == CV_32SC4) ||
(srcType == CV_32FC1) || (srcType == CV_32FC3) || (srcType == CV_32FC4);
const bool isElemSizeSupportedByNppi =
(!(elemSize%1) && ((elemSize/1)<=4)) ||
(!(elemSize%2) && ((elemSize/2)<=4)) ||
(!(elemSize%4) && ((elemSize/4)<=4)) ||
(!(elemSize%8) && ((elemSize/8)<=2));
const bool isElemSizeSupportedByGridTranspose =
(elemSize == 1) || (elemSize == 2) || (elemSize == 4) || (elemSize == 8);
const bool isSupported = isNppiNativelySupported || isElemSizeSupportedByNppi || isElemSizeSupportedByGridTranspose;

if (!isSupported)
Contributor

Would CV_Assert() be better here?
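For reference, the assertion form being suggested would presumably look something like this (a sketch using the isSupported flag computed above):

    CV_Assert(isSupported); // throws cv::Exception (StsAssert) and reports the stringified condition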

CV_Error(Error::StsUnsupportedFormat, "");
else if (src.empty())
CV_Error(Error::StsBadArg,"image is empty");

if ((src.cols == 1) && (dst.cols == 1))
src.copyTo(dst, stream);
else if (((src.cols == 1) || (src.rows == 1)) && (src.cols*src.elemSize() == src.step))
src.reshape(0, src.cols).copyTo(dst, stream);
else if (isNppiNativelySupported)
{
NppStreamHandler h(StreamAccessor::getStream(stream));

NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;

if (srcType == CV_8UC1)
nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_8UC3)
nppSafeCall( nppiTranspose_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_8UC4)
nppSafeCall( nppiTranspose_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_16UC1)
nppSafeCall( nppiTranspose_16u_C1R(src.ptr<Npp16u>(), static_cast<int>(src.step),
dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_16UC3)
nppSafeCall( nppiTranspose_16u_C3R(src.ptr<Npp16u>(), static_cast<int>(src.step),
dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_16UC4)
nppSafeCall( nppiTranspose_16u_C4R(src.ptr<Npp16u>(), static_cast<int>(src.step),
dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_16SC1)
nppSafeCall( nppiTranspose_16s_C1R(src.ptr<Npp16s>(), static_cast<int>(src.step),
dst.ptr<Npp16s>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_16SC3)
nppSafeCall( nppiTranspose_16s_C3R(src.ptr<Npp16s>(), static_cast<int>(src.step),
dst.ptr<Npp16s>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_16SC4)
nppSafeCall( nppiTranspose_16s_C4R(src.ptr<Npp16s>(), static_cast<int>(src.step),
dst.ptr<Npp16s>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_32SC1)
nppSafeCall( nppiTranspose_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step),
dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_32SC3)
nppSafeCall( nppiTranspose_32s_C3R(src.ptr<Npp32s>(), static_cast<int>(src.step),
dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_32SC4)
nppSafeCall( nppiTranspose_32s_C4R(src.ptr<Npp32s>(), static_cast<int>(src.step),
dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_32FC1)
nppSafeCall( nppiTranspose_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step),
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_32FC3)
nppSafeCall( nppiTranspose_32f_C3R(src.ptr<Npp32f>(), static_cast<int>(src.step),
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_32FC4)
nppSafeCall( nppiTranspose_32f_C4R(src.ptr<Npp32f>(), static_cast<int>(src.step),
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );

if (!stream)
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
}//end if (isNppiNativelySupported)
else if (isElemSizeSupportedByNppi)
{
NppStreamHandler h(StreamAccessor::getStream(stream));

NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;

if (!(elemSize%1) && ((elemSize/1)==1))
nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
else if (!(elemSize%1) && ((elemSize/1)==2))
nppSafeCall( nppiTranspose_16u_C1R(src.ptr<Npp16u>(), static_cast<int>(src.step),
dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
else if (!(elemSize%1) && ((elemSize/1)==3))
nppSafeCall( nppiTranspose_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
else if (!(elemSize%1) && ((elemSize/1)==4))
nppSafeCall( nppiTranspose_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
else if (!(elemSize%2) && ((elemSize/2)==1))
nppSafeCall( nppiTranspose_16u_C1R(src.ptr<Npp16u>(), static_cast<int>(src.step),
dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
else if (!(elemSize%2) && ((elemSize/2)==2))
nppSafeCall( nppiTranspose_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
else if (!(elemSize%2) && ((elemSize/2)==3))
nppSafeCall( nppiTranspose_16u_C3R(src.ptr<Npp16u>(), static_cast<int>(src.step),
dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
else if (!(elemSize%2) && ((elemSize/2)==4))
nppSafeCall( nppiTranspose_16u_C4R(src.ptr<Npp16u>(), static_cast<int>(src.step),
dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
else if (!(elemSize%4) && ((elemSize/4)==1))
nppSafeCall( nppiTranspose_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step),
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
else if (!(elemSize%4) && ((elemSize/4)==2))
nppSafeCall( nppiTranspose_16u_C4R(src.ptr<Npp16u>(), static_cast<int>(src.step),
dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
else if (!(elemSize%4) && ((elemSize/4)==3))
nppSafeCall( nppiTranspose_32f_C3R(src.ptr<Npp32f>(), static_cast<int>(src.step),
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
else if (!(elemSize%4) && ((elemSize/4)==4))
nppSafeCall( nppiTranspose_32f_C4R(src.ptr<Npp32f>(), static_cast<int>(src.step),
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
else if (!(elemSize%8) && ((elemSize/8)==1))
nppSafeCall( nppiTranspose_16u_C4R(src.ptr<Npp16u>(), static_cast<int>(src.step),
dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
else if (!(elemSize%8) && ((elemSize/8)==2))
nppSafeCall( nppiTranspose_32f_C4R(src.ptr<Npp32f>(), static_cast<int>(src.step),
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );

if (!stream)
Contributor
@cudawarped cudawarped Nov 10, 2022

Should this be applied to gridTranspose() as well? I can't understand why it was previously missing; surely the result could still have been in flight on return from this function when the default stream is passed?

Contributor Author

I have the opposite question: why is it ever needed? I left it here, but I don't understand the purpose of this extra synchronization when the default stream (synchronous by default) is used.

Contributor
@cudawarped cudawarped Nov 10, 2022

Kernel launches are asynchronous with respect to the host. The default stream syncs with other streams by default in legacy mode.

My interpretation is that the OpenCV API works on the assumption that if a stream isn't passed, the user wants synchronization.

Contributor Author
@chacha21 chacha21 Nov 10, 2022

Do you mean that, now that both cudaStreamPerThread and cudaStreamLegacy exist, people using OpenCV always expect the cudaStreamLegacy behaviour with the default stream, thus requiring the cudaDeviceSynchronize()?
If so, I agree.

Contributor

No. Users of the API will expect functions to which they don't pass a stream to be synchronous with respect to the host when they return. This would not be the case with either cudaStreamPerThread or cudaStreamLegacy, as the kernel launch is asynchronous in both cases.

In the example from the docs

    1) k_1<<<1, 1, 0, s>>>();
    2) k_2<<<1, 1>>>();
    3) k_3<<<1, 1, 0, s>>>();
    4) ...

k_2 waits on k_1 because k_2 is in the legacy default stream, and k_3 then waits on the legacy stream. Because of the specific way this has been set up, k_1 and k_2 have finished executing before the call to k_3<<<1, 1, 0, s>>>(); however, the result of k_3 may still be in flight after control has returned to the host on line 4.

Now, I haven't used per-thread default streams (I always use explicit streams), but my understanding is that if the CUDA_API_PER_THREAD_DEFAULT_STREAM macro were used to enable per-thread default streams, k_1 would run before k_3, but both would be asynchronous with respect to k_2. Either way, when control returns to the host on line 4, they may all still be in flight.

On the other hand, if we have the following, which is our case if no stream is passed,

    1) k_1<<<1, 1>>>();
    2) k_2<<<1, 1>>>();
    3) k_3<<<1, 1>>>();
    4) ...

then in either case, cudaStreamPerThread or cudaStreamLegacy, all three kernels may still be in flight when control returns to the host on line 4 if we don't explicitly synchronize.
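To make the hazard concrete, here is a minimal sketch (illustrative kernel and buffer names, not code from the PR):

    #include <cuda_runtime.h>

    __global__ void k(int* data) { data[threadIdx.x] += 1; }

    void api_call_without_stream(int* d_buf)
    {
        // Launched on the default stream: control returns to the host
        // immediately, so the kernel may still be in flight when this
        // function returns.
        k<<<1, 32>>>(d_buf);

        // Explicit synchronization restores the behaviour that callers
        // of a stream-less overload expect.
        cudaDeviceSynchronize();
    }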

Contributor Author
@chacha21 chacha21 Nov 10, 2022

OK, anyway, I just checked: gridTranspose() does ultimately call cudaDeviceSynchronize() for the null stream (see https://github.com/opencv/opencv_contrib/blob/4.x/modules/cudev/include/opencv2/cudev/grid/detail/transpose.hpp).
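For reference, the null-stream handling in that header follows roughly this pattern (paraphrased, not a verbatim quote):

    // after the transpose kernel is launched on the requested stream...
    CV_CUDEV_SAFE_CALL( cudaGetLastError() );

    // ...a null stream triggers a device-wide synchronization
    if (stream == 0)
        CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );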

Contributor

@cudawarped Do you have open issues with the PR? I want to merge, if you do not mind.

Contributor
@cudawarped cudawarped Dec 14, 2022

@asmorkalov I am not 100% convinced that the isNppiNativelySupported and isElemSizeSupportedByNppi flags are needed. I can understand that @chacha21 wants a fallback option in case something goes wrong, and because of that wants to separate out the two logic paths, but I am not sure whether the redundant calls under isElemSizeSupportedByNppi make the function harder to maintain. What do you think?

Additionally, elemSize1 is unused, the isElemSizeSupportedByNppi logic path won't be fully tested, and, given the names of the nppi functions, it may make more sense to just test the total element size directly instead of examining elemSize with modulo/division arithmetic, i.e.

      else if (!(elemSize%2) && ((elemSize/2)==2))
        nppSafeCall( nppiTranspose_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step),
          dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );

could be

      else if (elemSize==4)
        nppSafeCall( nppiTranspose_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step),
          dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );

Contributor Author

Whatever the decision, I will certainly agree: I explained my initial code structure, but I am OK with adapting to a more "OpenCV" style.

Contributor

  1. Conditions like if (!(elemSize%2) && ((elemSize/2)==2)) look very cryptic. I understand the logic behind it, but if (elemSize==4) is definitely more readable.
  2. The only reason I see for the fallback is the 2-channel matrix case. All other cases are handled with regular types. I propose merging "native" support and plain "support" into a single case and using the relevant Npp call (see the sketch below). No duplicated/dead branches, more obvious testing.
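A sketch of what that merged dispatch could look like (illustrative only; the mapping from element size to NPP variant is assumed, not taken from the PR):

    // Hypothetical single-path dispatch keyed on the total element size
    // in bytes; 2-channel types reuse an NPP variant of the same width.
    switch (elemSize)
    {
    case 1: // e.g. CV_8UC1
        nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
        break;
    case 2: // e.g. CV_16UC1 or CV_8UC2
        nppSafeCall( nppiTranspose_16u_C1R(src.ptr<Npp16u>(), static_cast<int>(src.step),
            dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
        break;
    case 4: // e.g. CV_32FC1, CV_16UC2 or CV_8UC4
        nppSafeCall( nppiTranspose_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step),
            dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
        break;
    case 8: // e.g. CV_64FC1, CV_32FC2 or CV_16UC4
        nppSafeCall( nppiTranspose_16u_C4R(src.ptr<Npp16u>(), static_cast<int>(src.step),
            dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
        break;
    // 3, 6, 12 and 16 bytes map to the C3/C4 variants analogously.
    default:
        CV_Error(Error::StsUnsupportedFormat, "");
    }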

CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
}//end if (isElemSizeSupportedByNppi)
else if (isElemSizeSupportedByGridTranspose)
{
if (elemSize == 1)
gridTranspose(globPtr<unsigned char>(src), globPtr<unsigned char>(dst), stream);
else if (elemSize == 2)
gridTranspose(globPtr<unsigned short>(src), globPtr<unsigned short>(dst), stream);
else if (elemSize == 4)
gridTranspose(globPtr<signed int>(src), globPtr<signed int>(dst), stream);
else if (elemSize == 8)
gridTranspose(globPtr<double>(src), globPtr<double>(dst), stream);
}//end if (isElemSizeSupportedByGridTranspose)

syncOutput(dst, _dst, stream);
}
21 changes: 20 additions & 1 deletion modules/cudaarithm/test/test_core.cpp
@@ -231,12 +231,31 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Transpose, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1),
MatType(CV_8UC2),
MatType(CV_8UC3),
MatType(CV_8UC4),
MatType(CV_8SC1),
MatType(CV_8SC2),
MatType(CV_8SC3),
MatType(CV_8SC4),
MatType(CV_16UC1),
MatType(CV_16UC2),
MatType(CV_16UC3),
MatType(CV_16UC4),
MatType(CV_16SC1),
MatType(CV_16SC2),
MatType(CV_16SC3),
MatType(CV_16SC4),
MatType(CV_32SC1),
MatType(CV_32SC2),
MatType(CV_32SC3),
MatType(CV_32SC4),
MatType(CV_32FC1),
MatType(CV_32FC2),
MatType(CV_32FC3),
MatType(CV_32FC4),
MatType(CV_64FC1),
MatType(CV_64FC2)),
WHOLE_SUBMAT));

////////////////////////////////////////////////////////////////////////////////