
More data types supported in cv::cuda::transpose() #3371


Open

wants to merge 9 commits into base: 4.x

Changes from 3 commits
146 changes: 136 additions & 10 deletions modules/cudaarithm/src/cuda/transpose.cu
@@ -60,34 +60,160 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream)
{
GpuMat src = getInputMat(_src, stream);

const int srcType = src.type();
Contributor

Is this redundant?

const int srcDepth = src.depth();
const int srcCn = src.channels();
const size_t elemSize = src.elemSize();

const size_t elemSize1 = src.elemSize1();

GpuMat dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream);

const bool isNppiNativelySupported =
(srcType == CV_8UC1) || (srcType == CV_8UC3) || (srcType == CV_8UC4) ||
(srcType == CV_16UC1) || (srcType == CV_16UC3) || (srcType == CV_16UC4) ||
(srcType == CV_16SC1) || (srcType == CV_16SC3) || (srcType == CV_16SC4) ||
(srcType == CV_32SC1) || (srcType == CV_32SC3) || (srcType == CV_32SC4) ||
(srcType == CV_32FC1) || (srcType == CV_32FC3) || (srcType == CV_32FC4);
const bool isElemSizeSupportedByNppi =
(!(elemSize%1) && ((elemSize/1)<=4)) ||
(!(elemSize%2) && ((elemSize/2)<=4)) ||
(!(elemSize%4) && ((elemSize/4)<=4)) ||
(!(elemSize%8) && ((elemSize/8)<=2));
const bool isElemSizeSupportedByGridTranspose =
(elemSize == 1) || (elemSize == 2) || (elemSize == 4) || (elemSize == 8);
const bool isSupported = isNppiNativelySupported || isElemSizeSupportedByNppi || isElemSizeSupportedByGridTranspose;

if (!isSupported)
Contributor

Would CV_Assert() be better here?
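For reference, the assertion form being suggested would presumably look something like this (a sketch using the isSupported flag computed above):

    CV_Assert(isSupported); // throws cv::Exception (StsAssert) and reports the stringified condition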

CV_Error(Error::StsUnsupportedFormat, "");
else if (src.empty())
CV_Error(Error::StsBadArg,"image is empty");

if ((src.cols == 1) && (dst.cols == 1))
src.copyTo(dst, stream);
else if (((src.cols == 1) || (src.rows == 1)) && (src.cols*src.elemSize() == src.step))
src.reshape(0, src.cols).copyTo(dst, stream);
else if (isNppiNativelySupported)
{
NppStreamHandler h(StreamAccessor::getStream(stream));

NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;

if (srcType == CV_8UC1)
nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_8UC3)
nppSafeCall( nppiTranspose_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_8UC4)
nppSafeCall( nppiTranspose_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_16UC1)
nppSafeCall( nppiTranspose_16u_C1R(src.ptr<Npp16u>(), static_cast<int>(src.step),
dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_16UC3)
nppSafeCall( nppiTranspose_16u_C3R(src.ptr<Npp16u>(), static_cast<int>(src.step),
dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_16UC4)
nppSafeCall( nppiTranspose_16u_C4R(src.ptr<Npp16u>(), static_cast<int>(src.step),
dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_16SC1)
nppSafeCall( nppiTranspose_16s_C1R(src.ptr<Npp16s>(), static_cast<int>(src.step),
dst.ptr<Npp16s>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_16SC3)
nppSafeCall( nppiTranspose_16s_C3R(src.ptr<Npp16s>(), static_cast<int>(src.step),
dst.ptr<Npp16s>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_16SC4)
nppSafeCall( nppiTranspose_16s_C4R(src.ptr<Npp16s>(), static_cast<int>(src.step),
dst.ptr<Npp16s>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_32SC1)
nppSafeCall( nppiTranspose_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step),
dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_32SC3)
nppSafeCall( nppiTranspose_32s_C3R(src.ptr<Npp32s>(), static_cast<int>(src.step),
dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_32SC4)
nppSafeCall( nppiTranspose_32s_C4R(src.ptr<Npp32s>(), static_cast<int>(src.step),
dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_32FC1)
nppSafeCall( nppiTranspose_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step),
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_32FC3)
nppSafeCall( nppiTranspose_32f_C3R(src.ptr<Npp32f>(), static_cast<int>(src.step),
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
else if (srcType == CV_32FC4)
nppSafeCall( nppiTranspose_32f_C4R(src.ptr<Npp32f>(), static_cast<int>(src.step),
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );

if (!stream)
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
}//end if (isNppiNativelySupported)
else if (isElemSizeSupportedByNppi)
{
NppStreamHandler h(StreamAccessor::getStream(stream));

NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;

if (!(elemSize%1) && ((elemSize/1)==1))
nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
else if (!(elemSize%1) && ((elemSize/1)==2))
nppSafeCall( nppiTranspose_16u_C1R(src.ptr<Npp16u>(), static_cast<int>(src.step),
dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
else if (!(elemSize%1) && ((elemSize/1)==3))
nppSafeCall( nppiTranspose_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
else if (!(elemSize%1) && ((elemSize/1)==4))
nppSafeCall( nppiTranspose_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
else if (!(elemSize%2) && ((elemSize/2)==1))
nppSafeCall( nppiTranspose_16u_C1R(src.ptr<Npp16u>(), static_cast<int>(src.step),
dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
else if (!(elemSize%2) && ((elemSize/2)==2))
nppSafeCall( nppiTranspose_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
else if (!(elemSize%2) && ((elemSize/2)==3))
nppSafeCall( nppiTranspose_16u_C3R(src.ptr<Npp16u>(), static_cast<int>(src.step),
dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
else if (!(elemSize%2) && ((elemSize/2)==4))
nppSafeCall( nppiTranspose_16u_C4R(src.ptr<Npp16u>(), static_cast<int>(src.step),
dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
else if (!(elemSize%4) && ((elemSize/4)==1))
nppSafeCall( nppiTranspose_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step),
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
else if (!(elemSize%4) && ((elemSize/4)==2))
nppSafeCall( nppiTranspose_16u_C4R(src.ptr<Npp16u>(), static_cast<int>(src.step),
dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
else if (!(elemSize%4) && ((elemSize/4)==3))
nppSafeCall( nppiTranspose_32f_C3R(src.ptr<Npp32f>(), static_cast<int>(src.step),
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
else if (!(elemSize%4) && ((elemSize/4)==4))
nppSafeCall( nppiTranspose_32f_C4R(src.ptr<Npp32f>(), static_cast<int>(src.step),
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
else if (!(elemSize%8) && ((elemSize/8)==1))
nppSafeCall( nppiTranspose_16u_C4R(src.ptr<Npp16u>(), static_cast<int>(src.step),
dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
else if (!(elemSize%8) && ((elemSize/8)==2))
nppSafeCall( nppiTranspose_32f_C4R(src.ptr<Npp32f>(), static_cast<int>(src.step),
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );

if (!stream)
Contributor
@cudawarped cudawarped Nov 10, 2022

Should this be applied to gridTranspose() as well? I can't understand why it was previously missing; surely the result could still have been in flight on return from this function when the default stream is passed?

Contributor Author

I have the opposite question: why is it ever needed? I left it here, but I don't understand the purpose of this extra synchronization when the default stream (synchronous by default) is used.

Contributor
@cudawarped cudawarped Nov 10, 2022

Kernel launches are asynchronous with respect to the host. The default stream syncs with other streams by default in legacy mode.

My interpretation is that the OpenCV API works on the assumption that if a stream isn't passed, the user wants synchronization.

Contributor Author
@chacha21 chacha21 Nov 10, 2022

Do you mean that, now that both cudaStreamPerThread and cudaStreamLegacy exist, people using OpenCV always expect the cudaStreamLegacy behaviour with the default stream, thus requiring the cudaDeviceSynchronize()?
If so, I agree.

Contributor

No. Users of the API will expect functions to which they don't pass a stream to be synchronous with respect to the host when they return. This would not be the case with either cudaStreamPerThread or cudaStreamLegacy, as the kernel launch is asynchronous in both cases.

In the example from the docs

    1) k_1<<<1, 1, 0, s>>>();
    2) k_2<<<1, 1>>>();
    3) k_3<<<1, 1, 0, s>>>();
    4) ...

k_2 waits on k_1 because k_2 is in the legacy default stream, and k_3 then waits on the legacy stream. Because of the specific way this has been set up, k_1 and k_2 have finished executing before the call to k_3<<<1, 1, 0, s>>>(); however, the result of k_3 may still be in flight after control has returned to the host on line 4.

Now, I haven't used per-thread default streams (I always use explicit streams), but my understanding is that if the CUDA_API_PER_THREAD_DEFAULT_STREAM macro were used to enable per-thread default streams, k_1 would run before k_3, but both would be asynchronous with respect to k_2. Either way, when control returns to the host on line 4, they may all still be in flight.

On the other hand, if we have the following, which is our case if no stream is passed,

    1) k_1<<<1, 1>>>();
    2) k_2<<<1, 1>>>();
    3) k_3<<<1, 1>>>();
    4) ...

then in either case, cudaStreamPerThread or cudaStreamLegacy, all three kernels may still be in flight when control returns to the host on line 4 if we don't explicitly synchronize.
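To make the hazard concrete, here is a minimal sketch (illustrative kernel and buffer names, not code from the PR):

    #include <cuda_runtime.h>

    __global__ void k(int* data) { data[threadIdx.x] += 1; }

    void api_call_without_stream(int* d_buf)
    {
        // Launched on the default stream: control returns to the host
        // immediately, so the kernel may still be in flight when this
        // function returns.
        k<<<1, 32>>>(d_buf);

        // Explicit synchronization restores the behaviour that callers
        // of a stream-less overload expect.
        cudaDeviceSynchronize();
    }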

Contributor Author
@chacha21 chacha21 Nov 10, 2022

OK, anyway, I just checked: gridTranspose() does ultimately call cudaDeviceSynchronize() for the null stream (see https://github.com/opencv/opencv_contrib/blob/4.x/modules/cudev/include/opencv2/cudev/grid/detail/transpose.hpp).
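For reference, the null-stream handling in that header follows roughly this pattern (paraphrased, not a verbatim quote):

    // after the transpose kernel is launched on the requested stream...
    CV_CUDEV_SAFE_CALL( cudaGetLastError() );

    // ...a null stream triggers a device-wide synchronization
    if (stream == 0)
        CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );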

Contributor

@cudawarped Do you have open issues with the PR? I want to merge, if you do not mind.

Contributor
@cudawarped cudawarped Dec 14, 2022

@asmorkalov I am not 100% convinced that the isNppiNativelySupported and isElemSizeSupportedByNppi flags are needed. I can understand that @chacha21 wants a fallback option in case something goes wrong, and because of that wants to separate out the two logic paths, but I am not sure whether the redundant calls under isElemSizeSupportedByNppi make the function harder to maintain. What do you think?

Additionally, elemSize1 is unused, the isElemSizeSupportedByNppi logic path won't be fully tested, and, given the names of the nppi functions, it may make more sense to just test the total element size directly instead of examining elemSize with modulo/division arithmetic, i.e.

      else if (!(elemSize%2) && ((elemSize/2)==2))
        nppSafeCall( nppiTranspose_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step),
          dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );

could be

      else if (elemSize==4)
        nppSafeCall( nppiTranspose_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step),
          dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );

Contributor Author

Whatever the decision, I will certainly agree: I explained my initial code structure, but I am OK with adapting to a more "OpenCV" style.

Contributor

  1. Conditions like if (!(elemSize%2) && ((elemSize/2)==2)) look very cryptic. I understand the logic behind it, but if (elemSize==4) is definitely more readable.
  2. The only reason I see for the fallback is the 2-channel matrix case. All other cases are handled with regular types. I propose merging "native" support and plain "support" into a single case and using the relevant Npp call (see the sketch below). No duplicated/dead branches, more obvious testing.
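A sketch of what that merged dispatch could look like (illustrative only; the mapping from element size to NPP variant is assumed, not taken from the PR):

    // Hypothetical single-path dispatch keyed on the total element size
    // in bytes; 2-channel types reuse an NPP variant of the same width.
    switch (elemSize)
    {
    case 1: // e.g. CV_8UC1
        nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
        break;
    case 2: // e.g. CV_16UC1 or CV_8UC2
        nppSafeCall( nppiTranspose_16u_C1R(src.ptr<Npp16u>(), static_cast<int>(src.step),
            dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
        break;
    case 4: // e.g. CV_32FC1, CV_16UC2 or CV_8UC4
        nppSafeCall( nppiTranspose_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step),
            dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
        break;
    case 8: // e.g. CV_64FC1, CV_32FC2 or CV_16UC4
        nppSafeCall( nppiTranspose_16u_C4R(src.ptr<Npp16u>(), static_cast<int>(src.step),
            dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) );
        break;
    // 3, 6, 12 and 16 bytes map to the C3/C4 variants analogously.
    default:
        CV_Error(Error::StsUnsupportedFormat, "");
    }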

CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
}//end if (isElemSizeSupportedByNppi)
else if (isElemSizeSupportedByGridTranspose)
{
if (elemSize == 1)
gridTranspose(globPtr<unsigned char>(src), globPtr<unsigned char>(dst), stream);
else if (elemSize == 2)
gridTranspose(globPtr<unsigned short>(src), globPtr<unsigned short>(dst), stream);
else if (elemSize == 4)
gridTranspose(globPtr<signed int>(src), globPtr<signed int>(dst), stream);
else if (elemSize == 8)
gridTranspose(globPtr<double>(src), globPtr<double>(dst), stream);
}//end if (isElemSizeSupportedByGridTranspose)

syncOutput(dst, _dst, stream);
}
21 changes: 20 additions & 1 deletion modules/cudaarithm/test/test_core.cpp
@@ -231,12 +231,31 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Transpose, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1),
MatType(CV_8UC2),
MatType(CV_8UC3),
MatType(CV_8UC4),
MatType(CV_8SC1),
MatType(CV_8SC2),
MatType(CV_8SC3),
MatType(CV_8SC4),
MatType(CV_16UC1),
MatType(CV_16UC2),
MatType(CV_16UC3),
MatType(CV_16UC4),
MatType(CV_16SC1),
MatType(CV_16SC2),
MatType(CV_16SC3),
MatType(CV_16SC4),
MatType(CV_32SC1),
MatType(CV_32SC2),
MatType(CV_32SC3),
MatType(CV_32SC4),
MatType(CV_32FC1),
MatType(CV_32FC2),
MatType(CV_32FC3),
MatType(CV_32FC4),
MatType(CV_64FC1),
MatType(CV_64FC2)),
WHOLE_SUBMAT));

////////////////////////////////////////////////////////////////////////////////