opencv · chacha21 · Nov 9, 2022 · Nov 10, 2022 · Nov 10, 2022 · Nov 11, 2022
diff --git a/modules/cudaarithm/src/cuda/transpose.cu b/modules/cudaarithm/src/cuda/transpose.cu
@@ -60,34 +60,202 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream)
 {
     GpuMat src = getInputMat(_src, stream);
 
+    const int srcType = src.type();
+    const int srcDepth = src.depth();
+    const int srcCn = src.channels();
     const size_t elemSize = src.elemSize();
-
-    CV_Assert( elemSize == 1 || elemSize == 4 || elemSize == 8 );
+    const size_t elemSize1 = src.elemSize1();
 
     GpuMat dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream);
 
-    if (elemSize == 1)
-    {
-        NppStreamHandler h(StreamAccessor::getStream(stream));
+    const bool isSupported =
+      (elemSize == 1) || (elemSize == 2) || (elemSize == 3) || (elemSize == 4) ||
+      (elemSize == 6) || (elemSize == 8) || (elemSize == 12) || (elemSize == 16);
 
+    if (!isSupported)
+        CV_Error(Error::StsUnsupportedFormat, "");
+    else if (src.empty())
+        CV_Error(Error::StsBadArg,"image is empty");
+
+    if ((src.rows == 1) && (src.cols == 1))
+        src.copyTo(dst, stream);
+    else if (src.rows == 1)
+        src.reshape(0, src.cols).copyTo(dst, stream);
+    else if ((src.cols == 1) && src.isContinuous())
+        src.reshape(0, src.cols).copyTo(dst, stream);
+    else
+    {
         NppiSize sz;
         sz.width  = src.cols;
         sz.height = src.rows;
 
-        nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
+        if (!stream)
+        {
+          //native implementation
+          if (srcType == CV_8UC1)
+            nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
+          else if (srcType == CV_8UC3)
+            nppSafeCall( nppiTranspose_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+              dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
+          else if (srcType == CV_8UC4)
+            nppSafeCall( nppiTranspose_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+              dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
+          else if (srcType == CV_16UC1)
+            nppSafeCall( nppiTranspose_16u_C1R(src.ptr<Npp16u>(), static_cast<int>(src.step),
+              dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
+          else if (srcType == CV_16UC3)
+            nppSafeCall( nppiTranspose_16u_C3R(src.ptr<Npp16u>(), static_cast<int>(src.step),
+              dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
+          else if (srcType == CV_16UC4)
+            nppSafeCall( nppiTranspose_16u_C4R(src.ptr<Npp16u>(), static_cast<int>(src.step),
+              dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
+          else if (srcType == CV_16SC1)
+            nppSafeCall( nppiTranspose_16s_C1R(src.ptr<Npp16s>(), static_cast<int>(src.step),
+              dst.ptr<Npp16s>(), static_cast<int>(dst.step), sz) );
+          else if (srcType == CV_16SC3)
+            nppSafeCall( nppiTranspose_16s_C3R(src.ptr<Npp16s>(), static_cast<int>(src.step),
+              dst.ptr<Npp16s>(), static_cast<int>(dst.step), sz) );
+          else if (srcType == CV_16SC4)
+            nppSafeCall( nppiTranspose_16s_C4R(src.ptr<Npp16s>(), static_cast<int>(src.step),
+              dst.ptr<Npp16s>(), static_cast<int>(dst.step), sz) );
+          else if (srcType == CV_32SC1)
+            nppSafeCall( nppiTranspose_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step),
+              dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz) );
+          else if (srcType == CV_32SC3)
+            nppSafeCall( nppiTranspose_32s_C3R(src.ptr<Npp32s>(), static_cast<int>(src.step),
+              dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz) );
+          else if (srcType == CV_32SC4)
+            nppSafeCall( nppiTranspose_32s_C4R(src.ptr<Npp32s>(), static_cast<int>(src.step),
+              dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz) );
+          else if (srcType == CV_32FC1)
+            nppSafeCall( nppiTranspose_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step),
+              dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
+          else if (srcType == CV_32FC3)
+            nppSafeCall( nppiTranspose_32f_C3R(src.ptr<Npp32f>(), static_cast<int>(src.step),
+              dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
+          else if (srcType == CV_32FC4)
+            nppSafeCall( nppiTranspose_32f_C4R(src.ptr<Npp32f>(), static_cast<int>(src.step),
+              dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
+          //reinterpretation
+          else if (elemSize == 1)
+            nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+              dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
+          else if (elemSize == 2)
+            nppSafeCall( nppiTranspose_16u_C1R(src.ptr<Npp16u>(), static_cast<int>(src.step),
+              dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
+          else if (elemSize == 3)
+            nppSafeCall( nppiTranspose_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+              dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
+          else if (elemSize == 4)
+            nppSafeCall( nppiTranspose_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step),
+              dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz) );
+          else if (elemSize == 6)
+            nppSafeCall( nppiTranspose_16u_C3R(src.ptr<Npp16u>(), static_cast<int>(src.step),
+              dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
+          else if (elemSize == 8)
+            nppSafeCall( nppiTranspose_16u_C4R(src.ptr<Npp16u>(), static_cast<int>(src.step),
+              dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
+          else if (elemSize == 12)
+            nppSafeCall( nppiTranspose_32s_C3R(src.ptr<Npp32s>(), static_cast<int>(src.step),
+              dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz) );
+          else if (elemSize == 16)
+            nppSafeCall( nppiTranspose_32s_C4R(src.ptr<Npp32s>(), static_cast<int>(src.step),
+              dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz) );
+        }//end if (!stream)
+        else//if (stream != 0)
+        {
+          NppStreamContext ctx;
+          nppSafeCall( nppGetStreamContext(&ctx) );
+          ctx.hStream = StreamAccessor::getStream(stream);
+
+          //native implementation
+          if (srcType == CV_8UC1)
+            nppSafeCall( nppiTranspose_8u_C1R_Ctx(src.ptr<Npp8u>(), static_cast<int>(src.step),
+              dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, ctx) );
+          else if (srcType == CV_8UC3)
+            nppSafeCall( nppiTranspose_8u_C3R_Ctx(src.ptr<Npp8u>(), static_cast<int>(src.step),
+              dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, ctx) );
+          else if (srcType == CV_8UC4)
+            nppSafeCall( nppiTranspose_8u_C4R_Ctx(src.ptr<Npp8u>(), static_cast<int>(src.step),
+              dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, ctx) );
+          else if (srcType == CV_16UC1)
+            nppSafeCall( nppiTranspose_16u_C1R_Ctx(src.ptr<Npp16u>(), static_cast<int>(src.step),
+              dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz, ctx) );
+          else if (srcType == CV_16UC3)
+            nppSafeCall( nppiTranspose_16u_C3R_Ctx(src.ptr<Npp16u>(), static_cast<int>(src.step),
+              dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz, ctx) );
+          else if (srcType == CV_16UC4)
+            nppSafeCall( nppiTranspose_16u_C4R_Ctx(src.ptr<Npp16u>(), static_cast<int>(src.step),
+              dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz, ctx) );
+          else if (srcType == CV_16SC1)
+            nppSafeCall( nppiTranspose_16s_C1R_Ctx(src.ptr<Npp16s>(), static_cast<int>(src.step),
+              dst.ptr<Npp16s>(), static_cast<int>(dst.step), sz, ctx) );
+          else if (srcType == CV_16SC3)
+            nppSafeCall( nppiTranspose_16s_C3R_Ctx(src.ptr<Npp16s>(), static_cast<int>(src.step),
+              dst.ptr<Npp16s>(), static_cast<int>(dst.step), sz, ctx) );
+          else if (srcType == CV_16SC4)
+            nppSafeCall( nppiTranspose_16s_C4R_Ctx(src.ptr<Npp16s>(), static_cast<int>(src.step),
+              dst.ptr<Npp16s>(), static_cast<int>(dst.step), sz, ctx) );
+          else if (srcType == CV_32SC1)
+            nppSafeCall( nppiTranspose_32s_C1R_Ctx(src.ptr<Npp32s>(), static_cast<int>(src.step),
+              dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz, ctx) );
+          else if (srcType == CV_32SC3)
+            nppSafeCall( nppiTranspose_32s_C3R_Ctx(src.ptr<Npp32s>(), static_cast<int>(src.step),
+              dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz, ctx) );
+          else if (srcType == CV_32SC4)
+            nppSafeCall( nppiTranspose_32s_C4R_Ctx(src.ptr<Npp32s>(), static_cast<int>(src.step),
+              dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz, ctx) );
+          else if (srcType == CV_32FC1)
+            nppSafeCall( nppiTranspose_32f_C1R_Ctx(src.ptr<Npp32f>(), static_cast<int>(src.step),
+              dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, ctx) );
+          else if (srcType == CV_32FC3)
+            nppSafeCall( nppiTranspose_32f_C3R_Ctx(src.ptr<Npp32f>(), static_cast<int>(src.step),
+              dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, ctx) );
+          else if (srcType == CV_32FC4)
+            nppSafeCall( nppiTranspose_32f_C4R_Ctx(src.ptr<Npp32f>(), static_cast<int>(src.step),
+              dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, ctx) );
+          //reinterpretation
+          else if (elemSize == 1)
+            nppSafeCall( nppiTranspose_8u_C1R_Ctx(src.ptr<Npp8u>(), static_cast<int>(src.step),
+              dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, ctx) );
+          else if (elemSize == 2)
+            nppSafeCall( nppiTranspose_16u_C1R_Ctx(src.ptr<Npp16u>(), static_cast<int>(src.step),
+              dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz, ctx) );
+          else if (elemSize == 3)
+            nppSafeCall( nppiTranspose_8u_C3R_Ctx(src.ptr<Npp8u>(), static_cast<int>(src.step),
+              dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, ctx) );
+          else if (elemSize == 4)
+            nppSafeCall( nppiTranspose_32s_C1R_Ctx(src.ptr<Npp32s>(), static_cast<int>(src.step),
+              dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz, ctx) );
+          else if (elemSize == 6)
+            nppSafeCall( nppiTranspose_16u_C3R_Ctx(src.ptr<Npp16u>(), static_cast<int>(src.step),
+              dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz, ctx) );
+          else if (elemSize == 8)
+            nppSafeCall( nppiTranspose_16u_C4R_Ctx(src.ptr<Npp16u>(), static_cast<int>(src.step),
+              dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz, ctx) );
+          else if (elemSize == 12)
+            nppSafeCall( nppiTranspose_32s_C3R_Ctx(src.ptr<Npp32s>(), static_cast<int>(src.step),
+              dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz, ctx) );
+          else if (elemSize == 16)
+            nppSafeCall( nppiTranspose_32s_C4R_Ctx(src.ptr<Npp32s>(), static_cast<int>(src.step),
+              dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz, ctx) );
+        }//end if (stream != 0)
 
         if (!stream)
-            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
-    }
+          CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
+    }//end if
+
+    /*
+    if (elemSize == 1)
+      gridTranspose(globPtr<unsigned char>(src), globPtr<unsigned char>(dst), stream);
+    else if (elemSize == 2)
+      gridTranspose(globPtr<unsigned short>(src), globPtr<unsigned short>(dst), stream);
     else if (elemSize == 4)
-    {
-        gridTranspose(globPtr<int>(src), globPtr<int>(dst), stream);
-    }
-    else // if (elemSize == 8)
-    {
-        gridTranspose(globPtr<double>(src), globPtr<double>(dst), stream);
-    }
+      gridTranspose(globPtr<signed int>(src), globPtr<signed int>(dst), stream);
+    else if (elemSize == 8)
+      gridTranspose(globPtr<double>(src), globPtr<double>(dst), stream);
+    */
 
     syncOutput(dst, _dst, stream);
 }

diff --git a/modules/cudaarithm/test/test_core.cpp b/modules/cudaarithm/test/test_core.cpp
@@ -231,12 +231,31 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Transpose, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
     testing::Values(MatType(CV_8UC1),
+                    MatType(CV_8UC2),
+                    MatType(CV_8UC3),
                     MatType(CV_8UC4),
+                    MatType(CV_8SC1),
+                    MatType(CV_8SC2),
+                    MatType(CV_8SC3),
+                    MatType(CV_8SC4),
+                    MatType(CV_16UC1),
                     MatType(CV_16UC2),
+                    MatType(CV_16UC3),
+                    MatType(CV_16UC4),
+                    MatType(CV_16SC1),
                     MatType(CV_16SC2),
+                    MatType(CV_16SC3),
+                    MatType(CV_16SC4),
                     MatType(CV_32SC1),
                     MatType(CV_32SC2),
-                    MatType(CV_64FC1)),
+                    MatType(CV_32SC3),
+                    MatType(CV_32SC4),
+                    MatType(CV_32FC1),
+                    MatType(CV_32FC2),
+                    MatType(CV_32FC3),
+                    MatType(CV_32FC4),
+                    MatType(CV_64FC1),
+                    MatType(CV_64FC2)),
     WHOLE_SUBMAT));
 
 ////////////////////////////////////////////////////////////////////////////////