-
Notifications
You must be signed in to change notification settings - Fork 5.8k
disable npp in multistream context #3338
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: 4.x
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -45,8 +45,19 @@ | |
using namespace cv; | ||
using namespace cv::cuda; | ||
|
||
#define canUseContext NPP_VERSION >= (10 * 1000 + 1 * 100 + 0) | ||
#if canUseContext | ||
#define CTX_PREFIX _Ctx | ||
#else | ||
#define CTX_PREFIX | ||
#endif | ||
#define PPCAT_NX(A, B) A ## B | ||
#define PPCAT(A, B) PPCAT_NX(A, B) | ||
#define TRY_CTX(func) PPCAT(func, CTX_PREFIX) | ||
|
||
#if !defined HAVE_CUDA || defined(CUDA_DISABLER) | ||
|
||
|
||
void cv::cuda::warpAffine(InputArray, OutputArray, InputArray, Size, int, int, Scalar, Stream&) { throw_no_cuda(); } | ||
void cv::cuda::buildWarpAffineMaps(InputArray, bool, Size, OutputArray, OutputArray, Stream&) { throw_no_cuda(); } | ||
|
||
|
@@ -135,7 +146,17 @@ void cv::cuda::buildWarpPerspectiveMaps(InputArray _M, bool inverse, Size dsize, | |
|
||
namespace | ||
{ | ||
template <int DEPTH> struct NppWarpFunc | ||
template <int DEPTH, bool CanUseContext=true> struct NppWarpFunc | ||
{ | ||
typedef typename NPPTypeTraits<DEPTH>::npp_type npp_type; | ||
|
||
typedef NppStatus (*func_t)(const npp_type* pSrc, NppiSize srcSize, int srcStep, NppiRect srcRoi, npp_type* pDst, | ||
int dstStep, NppiRect dstRoi, const double coeffs[][3], | ||
int interpolation, NppStreamContext stream_ctx); | ||
}; | ||
|
||
template <int DEPTH> | ||
struct NppWarpFunc<DEPTH, false> | ||
{ | ||
typedef typename NPPTypeTraits<DEPTH>::npp_type npp_type; | ||
|
||
|
@@ -144,11 +165,24 @@ namespace | |
int interpolation); | ||
}; | ||
|
||
template <int DEPTH, typename NppWarpFunc<DEPTH>::func_t func> struct NppWarp | ||
template <int DEPTH, typename NppWarpFunc<DEPTH, canUseContext>::func_t func> struct NppWarp | ||
{ | ||
typedef typename NppWarpFunc<DEPTH>::npp_type npp_type; | ||
|
||
typedef typename NppWarpFunc<DEPTH, canUseContext>::npp_type npp_type; | ||
|
||
static void call(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, double coeffs[][3], int interpolation, cudaStream_t stream) | ||
{ | ||
call_impl(src, dst, coeffs, interpolation, stream, Int2Type<canUseContext>()); | ||
} | ||
|
||
template <int I> | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we use cudev::Int2Type? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sure thing |
||
struct Int2Type | ||
{ | ||
enum { value = I }; | ||
}; | ||
|
||
// with context | ||
static void call_impl(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, double coeffs[][3], int interpolation, cudaStream_t stream, Int2Type<true>) | ||
{ | ||
static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC}; | ||
|
||
|
@@ -168,7 +202,40 @@ namespace | |
dstroi.height = dst.rows; | ||
dstroi.width = dst.cols; | ||
|
||
cv::cuda::NppStreamHandler h(stream); | ||
NppStatus nppStatus = NPP_SUCCESS; | ||
NppStreamContext nppStreamContext{}; | ||
nppStatus = nppGetStreamContext(&nppStreamContext); | ||
CV_Assert(NPP_SUCCESS == nppStatus); | ||
nppStreamContext.hStream = stream; | ||
|
||
nppSafeCall( func(src.ptr<npp_type>(), srcsz, static_cast<int>(src.step), srcroi, | ||
dst.ptr<npp_type>(), static_cast<int>(dst.step), dstroi, | ||
coeffs, npp_inter[interpolation], nppStreamContext) ); | ||
|
||
if (stream == 0) | ||
cudaSafeCall( cudaDeviceSynchronize() ); | ||
} | ||
|
||
// without context | ||
static void call_impl(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, double coeffs[][3], int interpolation, cudaStream_t stream, Int2Type<false>) | ||
{ | ||
static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC}; | ||
|
||
NppiSize srcsz; | ||
srcsz.height = src.rows; | ||
srcsz.width = src.cols; | ||
|
||
NppiRect srcroi; | ||
srcroi.x = 0; | ||
srcroi.y = 0; | ||
srcroi.height = src.rows; | ||
srcroi.width = src.cols; | ||
|
||
NppiRect dstroi; | ||
dstroi.x = 0; | ||
dstroi.y = 0; | ||
dstroi.height = dst.rows; | ||
dstroi.width = dst.cols; | ||
|
||
nppSafeCall( func(src.ptr<npp_type>(), srcsz, static_cast<int>(src.step), srcroi, | ||
dst.ptr<npp_type>(), static_cast<int>(dst.step), dstroi, | ||
|
@@ -193,7 +260,8 @@ void cv::cuda::warpAffine(InputArray _src, OutputArray _dst, InputArray _M, Size | |
CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC ); | ||
CV_Assert( borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP ); | ||
|
||
_dst.create(dsize, src.type()); | ||
if (_dst.size() != dsize) | ||
_dst.create(dsize, src.type()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is redundant as void cv::cuda::GpuMat::create(int _rows, int _cols, int _type) should automatically handle this for you in addition to checking the type. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. far as I can remember, I checked the profiler and maybe there was some overhead? I am not sure... |
||
GpuMat dst = _dst.getGpuMat(); | ||
|
||
Size wholeSize; | ||
|
@@ -251,20 +319,20 @@ void cv::cuda::warpAffine(InputArray _src, OutputArray _dst, InputArray _M, Size | |
static const func_t funcs[2][6][4] = | ||
{ | ||
{ | ||
{NppWarp<CV_8U, nppiWarpAffine_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpAffine_8u_C3R>::call, NppWarp<CV_8U, nppiWarpAffine_8u_C4R>::call}, | ||
{NppWarp<CV_8U, TRY_CTX(nppiWarpAffine_8u_C1R)>::call, 0, NppWarp<CV_8U, TRY_CTX(nppiWarpAffine_8u_C3R)>::call, NppWarp<CV_8U, TRY_CTX(nppiWarpAffine_8u_C4R)>::call}, | ||
{0, 0, 0, 0}, | ||
{NppWarp<CV_16U, nppiWarpAffine_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpAffine_16u_C3R>::call, NppWarp<CV_16U, nppiWarpAffine_16u_C4R>::call}, | ||
{NppWarp<CV_16U, TRY_CTX(nppiWarpAffine_16u_C1R)>::call, 0, NppWarp<CV_16U, TRY_CTX(nppiWarpAffine_16u_C3R)>::call, NppWarp<CV_16U, TRY_CTX(nppiWarpAffine_16u_C4R)>::call}, | ||
{0, 0, 0, 0}, | ||
{NppWarp<CV_32S, nppiWarpAffine_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpAffine_32s_C3R>::call, NppWarp<CV_32S, nppiWarpAffine_32s_C4R>::call}, | ||
{NppWarp<CV_32F, nppiWarpAffine_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpAffine_32f_C3R>::call, NppWarp<CV_32F, nppiWarpAffine_32f_C4R>::call} | ||
{NppWarp<CV_32S, TRY_CTX(nppiWarpAffine_32s_C1R)>::call, 0, NppWarp<CV_32S, TRY_CTX(nppiWarpAffine_32s_C3R)>::call, NppWarp<CV_32S, TRY_CTX(nppiWarpAffine_32s_C4R)>::call}, | ||
{NppWarp<CV_32F, TRY_CTX(nppiWarpAffine_32f_C1R)>::call, 0, NppWarp<CV_32F, TRY_CTX(nppiWarpAffine_32f_C3R)>::call, NppWarp<CV_32F, TRY_CTX(nppiWarpAffine_32f_C4R)>::call} | ||
}, | ||
{ | ||
{NppWarp<CV_8U, nppiWarpAffineBack_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpAffineBack_8u_C3R>::call, NppWarp<CV_8U, nppiWarpAffineBack_8u_C4R>::call}, | ||
{NppWarp<CV_8U, TRY_CTX(nppiWarpAffineBack_8u_C1R)>::call, 0, NppWarp<CV_8U, TRY_CTX(nppiWarpAffineBack_8u_C3R)>::call, NppWarp<CV_8U, TRY_CTX(nppiWarpAffineBack_8u_C4R)>::call}, | ||
{0, 0, 0, 0}, | ||
{NppWarp<CV_16U, nppiWarpAffineBack_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpAffineBack_16u_C3R>::call, NppWarp<CV_16U, nppiWarpAffineBack_16u_C4R>::call}, | ||
{NppWarp<CV_16U, TRY_CTX(nppiWarpAffineBack_16u_C1R)>::call, 0, NppWarp<CV_16U, TRY_CTX(nppiWarpAffineBack_16u_C3R)>::call, NppWarp<CV_16U, TRY_CTX(nppiWarpAffineBack_16u_C4R)>::call}, | ||
{0, 0, 0, 0}, | ||
{NppWarp<CV_32S, nppiWarpAffineBack_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpAffineBack_32s_C3R>::call, NppWarp<CV_32S, nppiWarpAffineBack_32s_C4R>::call}, | ||
{NppWarp<CV_32F, nppiWarpAffineBack_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpAffineBack_32f_C3R>::call, NppWarp<CV_32F, nppiWarpAffineBack_32f_C4R>::call} | ||
{NppWarp<CV_32S, TRY_CTX(nppiWarpAffineBack_32s_C1R)>::call, 0, NppWarp<CV_32S, TRY_CTX(nppiWarpAffineBack_32s_C3R)>::call, NppWarp<CV_32S, TRY_CTX(nppiWarpAffineBack_32s_C4R)>::call}, | ||
{NppWarp<CV_32F, TRY_CTX(nppiWarpAffineBack_32f_C1R)>::call, 0, NppWarp<CV_32F, TRY_CTX(nppiWarpAffineBack_32f_C3R)>::call, NppWarp<CV_32F, TRY_CTX(nppiWarpAffineBack_32f_C4R)>::call} | ||
} | ||
}; | ||
|
||
|
@@ -390,20 +458,20 @@ void cv::cuda::warpPerspective(InputArray _src, OutputArray _dst, InputArray _M, | |
static const func_t funcs[2][6][4] = | ||
{ | ||
{ | ||
{NppWarp<CV_8U, nppiWarpPerspective_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspective_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspective_8u_C4R>::call}, | ||
{NppWarp<CV_8U, TRY_CTX(nppiWarpPerspective_8u_C1R)>::call, 0, NppWarp<CV_8U, TRY_CTX(nppiWarpPerspective_8u_C3R)>::call, NppWarp<CV_8U, TRY_CTX(nppiWarpPerspective_8u_C4R)>::call}, | ||
{0, 0, 0, 0}, | ||
{NppWarp<CV_16U, nppiWarpPerspective_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspective_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspective_16u_C4R>::call}, | ||
{NppWarp<CV_16U, TRY_CTX(nppiWarpPerspective_16u_C1R)>::call, 0, NppWarp<CV_16U, TRY_CTX(nppiWarpPerspective_16u_C3R)>::call, NppWarp<CV_16U, TRY_CTX(nppiWarpPerspective_16u_C4R)>::call}, | ||
{0, 0, 0, 0}, | ||
{NppWarp<CV_32S, nppiWarpPerspective_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspective_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspective_32s_C4R>::call}, | ||
{NppWarp<CV_32F, nppiWarpPerspective_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspective_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspective_32f_C4R>::call} | ||
{NppWarp<CV_32S, TRY_CTX(nppiWarpPerspective_32s_C1R)>::call, 0, NppWarp<CV_32S, TRY_CTX(nppiWarpPerspective_32s_C3R)>::call, NppWarp<CV_32S, TRY_CTX(nppiWarpPerspective_32s_C4R)>::call}, | ||
{NppWarp<CV_32F, TRY_CTX(nppiWarpPerspective_32f_C1R)>::call, 0, NppWarp<CV_32F, TRY_CTX(nppiWarpPerspective_32f_C3R)>::call, NppWarp<CV_32F, TRY_CTX(nppiWarpPerspective_32f_C4R)>::call} | ||
}, | ||
{ | ||
{NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C4R>::call}, | ||
{NppWarp<CV_8U, TRY_CTX(nppiWarpPerspectiveBack_8u_C1R)>::call, 0, NppWarp<CV_8U, TRY_CTX(nppiWarpPerspectiveBack_8u_C3R)>::call, NppWarp<CV_8U, TRY_CTX(nppiWarpPerspectiveBack_8u_C4R)>::call}, | ||
{0, 0, 0, 0}, | ||
{NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C4R>::call}, | ||
{NppWarp<CV_16U, TRY_CTX(nppiWarpPerspectiveBack_16u_C1R)>::call, 0, NppWarp<CV_16U, TRY_CTX(nppiWarpPerspectiveBack_16u_C3R)>::call, NppWarp<CV_16U, TRY_CTX(nppiWarpPerspectiveBack_16u_C4R)>::call}, | ||
{0, 0, 0, 0}, | ||
{NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C4R>::call}, | ||
{NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C4R>::call} | ||
{NppWarp<CV_32S, TRY_CTX(nppiWarpPerspectiveBack_32s_C1R)>::call, 0, NppWarp<CV_32S, TRY_CTX(nppiWarpPerspectiveBack_32s_C3R)>::call, NppWarp<CV_32S, TRY_CTX(nppiWarpPerspectiveBack_32s_C4R)>::call}, | ||
{NppWarp<CV_32F, TRY_CTX(nppiWarpPerspectiveBack_32f_C1R)>::call, 0, NppWarp<CV_32F, TRY_CTX(nppiWarpPerspectiveBack_32f_C3R)>::call, NppWarp<CV_32F, TRY_CTX(nppiWarpPerspectiveBack_32f_C4R)>::call} | ||
} | ||
}; | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
please user uppercase for macros, e.g.
NPP_CONTEXT_AVAILABLE