Update cuRAND state type from curandState to curandStatePhilox4_32_10_t for ChASE

brunowu · brunowu · commit 670dad3de4ee · 2023-03-18T16:33:03.000+01:00
diff --git a/ChASE-MPI/CMakeLists.txt b/ChASE-MPI/CMakeLists.txt
@@ -94,7 +94,6 @@ if(SCALAPACK_FOUND)
 	)
 endif()
 
-
 if(OpenMP_FOUND)
     target_link_libraries( chase_mpi INTERFACE
             OpenMP::OpenMP_CXX
diff --git a/ChASE-MPI/chase_mpi.hpp b/ChASE-MPI/chase_mpi.hpp
@@ -507,14 +507,6 @@ class ChaseMpi : public chase::Chase<T>
 #ifdef USE_NSIGHT
         nvtxRangePop();
 #endif
-        /*
-        std::mt19937 gen(2342.0);
-        std::normal_distribution<> normal_distribution;
-
-        for (std::size_t k = 0; k < N_; ++k){
-          v1[k] = getRandomT<T>([&]() { return normal_distribution(gen); });
-        }
-        */
 #ifdef HAS_OMP
         char* omp_threads;
         omp_threads = getenv("OMP_NUM_THREADS");
@@ -587,7 +579,7 @@ class ChaseMpi : public chase::Chase<T>
 #endif
 #ifdef HAS_OMP
         omp_set_num_threads(num_threads);
-#endif
+#endif	
         delete[] ritzv;
         delete[] isuppz;
         delete[] d;
@@ -644,7 +636,7 @@ class ChaseMpi : public chase::Chase<T>
             num_threads = std::atoi(omp_threads);
         }
         omp_set_num_threads(1);
-#endif
+#endif	
         // ENSURE that v1 has one norm
 #ifdef USE_NSIGHT
         nvtxRangePushA("Lanczos: loop");
@@ -706,7 +698,7 @@ class ChaseMpi : public chase::Chase<T>
         }
 #ifdef HAS_OMP
         omp_set_num_threads(num_threads);
-#endif
+#endif	
         delete[] isuppz;
         delete[] d;
         delete[] e;
diff --git a/ChASE-MPI/impl/chase_mpidla_blaslapack.hpp b/ChASE-MPI/impl/chase_mpidla_blaslapack.hpp
@@ -275,7 +275,7 @@ class ChaseMpiDLABlaslapack : public ChaseMpiDLAInterface<T>
             num_threads = std::atoi(omp_threads);
         }
         omp_set_num_threads(1);
-#endif        
+#endif   	    
         for (auto i = 0; i < unconverged; i++)
         {
             T alpha = -ritzv[i];
@@ -286,7 +286,7 @@ class ChaseMpiDLABlaslapack : public ChaseMpiDLAInterface<T>
         }
 #ifdef HAS_OMP
         omp_set_num_threads(num_threads);
-#endif    
+#endif   	
     }
 
     //! - This function performs the local computation for ChaseMpiDLA::heevd()
diff --git a/ChASE-MPI/impl/chase_mpidla_mgpu.hpp b/ChASE-MPI/impl/chase_mpidla_mgpu.hpp
@@ -40,7 +40,7 @@
 //! generated numbers
 //! @param[in] stream_ an asynchronous CUDA stream which allows to run this
 //! function asynchronously
-void chase_rand_normal(unsigned long long seed, curandState* states, float* v,
+void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states, float* v,
                        int n, cudaStream_t stream_);
 //! generate `n` random double numbers in normal distribution on each GPU
 //! device.
@@ -51,7 +51,7 @@ void chase_rand_normal(unsigned long long seed, curandState* states, float* v,
 //! generated numbers
 //! @param[in] stream_ an asynchronous CUDA stream which allows to run this
 //! function asynchronously
-void chase_rand_normal(unsigned long long seed, curandState* states, double* v,
+void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states, double* v,
                        int n, cudaStream_t stream_);
 //! generate `n` random complex float numbers in normal distribution on each GPU
 //! device. The real part and the imaginary part of each individual random
@@ -63,7 +63,7 @@ void chase_rand_normal(unsigned long long seed, curandState* states, double* v,
 //! generated numbers
 //! @param[in] stream_ an asynchronous CUDA stream which allows to run this
 //! function asynchronously
-void chase_rand_normal(unsigned long long seed, curandState* states,
+void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states,
                        std::complex<float>* v, int n, cudaStream_t stream_);
 //! generate `n` random complex double numbers in normal distribution on each
 //! GPU device. The real part and the imaginary part of each individual random
@@ -75,7 +75,7 @@ void chase_rand_normal(unsigned long long seed, curandState* states,
 //! generated numbers
 //! @param[in] stream_ an asynchronous CUDA stream which allows to run this
 //! function asynchronously
-void chase_rand_normal(unsigned long long seed, curandState* states,
+void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states,
                        std::complex<double>* v, int n, cudaStream_t stream_);
 
 //! shift the diagonal of a `nxn` square matrix `A` in float real data type.
@@ -258,7 +258,7 @@ class ChaseMpiDLAMultiGPU : public ChaseMpiDLAInterface<T>
         cuda_exec(
             cudaMalloc((void**)&d_ritz_, sizeof(Base<T>) * (nev_ + nex_)));
         cuda_exec(
-            cudaMalloc((void**)&states_, sizeof(curandState) * (256 * 32)));
+            cudaMalloc((void**)&states_, sizeof(curandStatePhilox4_32_10_t) * (256 * 32)));
 
         cublasCreate(&cublasH_);
         cusolverDnCreate(&cusolverH_);
@@ -786,7 +786,8 @@ class ChaseMpiDLAMultiGPU : public ChaseMpiDLAInterface<T>
         stream1_; //!< CUDA stream for asynchronous execution of kernels
     cudaStream_t
         stream2_; //!< CUDA stream for asynchronous execution of kernels
-    curandState* states_ = NULL; //!< a pointer of `curandState` for the cuRAND
+    //curandState* states_ = NULL; //!< a pointer of `curandState` for the cuRAND
+    curandStatePhilox4_32_10_t *states_ = NULL;
     T* d_H_;  //!< a pointer to a local buffer of size `m_*n_` on GPU, which is
               //!< mapped to `H_`.
     T* d_C_;  //!< a pointer to a local buffer of size `m_*(nev_+nex_)` on GPU,
diff --git a/ChASE-MPI/kernels/shift.cu b/ChASE-MPI/kernels/shift.cu
@@ -16,11 +16,11 @@
 #define GRIDDIM 32
 
 // generate `n` random float numbers on GPU
-__global__ void s_normal_kernel(unsigned long long seed, curandState* states,
+__global__ void s_normal_kernel(unsigned long long seed, curandStatePhilox4_32_10_t* states,
                                 float* v, int n)
 {
     int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    curandState* state = states + tid;
+    curandStatePhilox4_32_10_t* state = states + tid;
     curand_init(seed, tid, 0, state);
 
     int i;
@@ -33,11 +33,11 @@ __global__ void s_normal_kernel(unsigned long long seed, curandState* states,
 }
 
 // generate `n` random double numbers on GPU
-__global__ void d_normal_kernel(unsigned long long seed, curandState* states,
+__global__ void d_normal_kernel(unsigned long long seed, curandStatePhilox4_32_10_t* states,
                                 double* v, int n)
 {
     int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    curandState* state = states + tid;
+    curandStatePhilox4_32_10_t* state = states + tid;
     curand_init(seed, tid, 0, state);
 
     int i;
@@ -49,11 +49,11 @@ __global__ void d_normal_kernel(unsigned long long seed, curandState* states,
     }
 }
 // generate `n` random complex single numbers on GPU
-__global__ void c_normal_kernel(unsigned long long seed, curandState* states,
+__global__ void c_normal_kernel(unsigned long long seed, curandStatePhilox4_32_10_t* states,
                                 cuComplex* v, int n)
 {
     int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    curandState* state = states + tid;
+    curandStatePhilox4_32_10_t* state = states + tid;
     curand_init(seed, tid, 0, state);
 
     int i;
@@ -68,11 +68,11 @@ __global__ void c_normal_kernel(unsigned long long seed, curandState* states,
 }
 
 // generate `n` random complex double numbers on GPU
-__global__ void z_normal_kernel(unsigned long long seed, curandState* states,
+__global__ void z_normal_kernel(unsigned long long seed, curandStatePhilox4_32_10_t* states,
                                 cuDoubleComplex* v, int n)
 {
     int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    curandState* state = states + tid;
+    curandStatePhilox4_32_10_t* state = states + tid;
     curand_init(seed, tid, 0, state);
 
     int i;
@@ -166,26 +166,26 @@ __global__ void zshift_mgpu_matrix(cuDoubleComplex* A, std::size_t* off_m,
     }
 }
 
-void chase_rand_normal(unsigned long long seed, curandState* states, float* v,
+void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states, float* v,
                        int n, cudaStream_t stream_)
 {
     s_normal_kernel<<<GRIDDIM, BLOCKDIM, 0, stream_>>>(seed, states, v, n);
 }
 
-void chase_rand_normal(unsigned long long seed, curandState* states, double* v,
+void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states, double* v,
                        int n, cudaStream_t stream_)
 {
     d_normal_kernel<<<GRIDDIM, BLOCKDIM, 0, stream_>>>(seed, states, v, n);
 }
 
-void chase_rand_normal(unsigned long long seed, curandState* states,
+void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states,
                        std::complex<float>* v, int n, cudaStream_t stream_)
 {
     c_normal_kernel<<<GRIDDIM, BLOCKDIM, 0, stream_>>>(
         seed, states, reinterpret_cast<cuComplex*>(v), n);
 }
 
-void chase_rand_normal(unsigned long long seed, curandState* states,
+void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states,
                        std::complex<double>* v, int n, cudaStream_t stream_)
 {
     z_normal_kernel<<<GRIDDIM, BLOCKDIM, 0, stream_>>>(

Original file line number	Diff line number	Diff line change
`@@ -94,7 +94,6 @@ if(SCALAPACK_FOUND)`
`94`	`94`	`)`
`95`	`95`	`endif()`
`96`	`96`
`97`		`-`
`98`	`97`	`if(OpenMP_FOUND)`
`99`	`98`	`target_link_libraries( chase_mpi INTERFACE`
`100`	`99`	`OpenMP::OpenMP_CXX`
Original file line number	Diff line number	Diff line change
`@@ -275,7 +275,7 @@ class ChaseMpiDLABlaslapack : public ChaseMpiDLAInterface<T>`
`275`	`275`	`num_threads = std::atoi(omp_threads);`
`276`	`276`	`}`
`277`	`277`	`omp_set_num_threads(1);`
`278`		`-#endif`
	`278`	`+#endif`
`279`	`279`	`for (auto i = 0; i < unconverged; i++)`
`280`	`280`	`{`
`281`	`281`	`T alpha = -ritzv[i];`
`@@ -286,7 +286,7 @@ class ChaseMpiDLABlaslapack : public ChaseMpiDLAInterface<T>`
`286`	`286`	`}`
`287`	`287`	`#ifdef HAS_OMP`
`288`	`288`	`omp_set_num_threads(num_threads);`
`289`		`-#endif`
	`289`	`+#endif`
`290`	`290`	`}`
`291`	`291`
`292`	`292`	`//! - This function performs the local computation for ChaseMpiDLA::heevd()`
Original file line number	Diff line number	Diff line change
`@@ -16,11 +16,11 @@`
`16`	`16`	`#define GRIDDIM 32`
`17`	`17`
`18`	`18`	// generate `n` random float numbers on GPU
`19`		`-__global__ void s_normal_kernel(unsigned long long seed, curandState* states,`
	`19`	`+__global__ void s_normal_kernel(unsigned long long seed, curandStatePhilox4_32_10_t* states,`
`20`	`20`	`float* v, int n)`
`21`	`21`	`{`
`22`	`22`	`int tid = blockIdx.x * blockDim.x + threadIdx.x;`
`23`		`- curandState* state = states + tid;`
	`23`	`+ curandStatePhilox4_32_10_t* state = states + tid;`
`24`	`24`	`curand_init(seed, tid, 0, state);`
`25`	`25`
`26`	`26`	`int i;`
`@@ -33,11 +33,11 @@ __global__ void s_normal_kernel(unsigned long long seed, curandState* states,`
`33`	`33`	`}`
`34`	`34`
`35`	`35`	// generate `n` random double numbers on GPU
`36`		`-__global__ void d_normal_kernel(unsigned long long seed, curandState* states,`
	`36`	`+__global__ void d_normal_kernel(unsigned long long seed, curandStatePhilox4_32_10_t* states,`
`37`	`37`	`double* v, int n)`
`38`	`38`	`{`
`39`	`39`	`int tid = blockIdx.x * blockDim.x + threadIdx.x;`
`40`		`- curandState* state = states + tid;`
	`40`	`+ curandStatePhilox4_32_10_t* state = states + tid;`
`41`	`41`	`curand_init(seed, tid, 0, state);`
`42`	`42`
`43`	`43`	`int i;`
`@@ -49,11 +49,11 @@ __global__ void d_normal_kernel(unsigned long long seed, curandState* states,`
`49`	`49`	`}`
`50`	`50`	`}`
`51`	`51`	// generate `n` random complex single numbers on GPU
`52`		`-__global__ void c_normal_kernel(unsigned long long seed, curandState* states,`
	`52`	`+__global__ void c_normal_kernel(unsigned long long seed, curandStatePhilox4_32_10_t* states,`
`53`	`53`	`cuComplex* v, int n)`
`54`	`54`	`{`
`55`	`55`	`int tid = blockIdx.x * blockDim.x + threadIdx.x;`
`56`		`- curandState* state = states + tid;`
	`56`	`+ curandStatePhilox4_32_10_t* state = states + tid;`
`57`	`57`	`curand_init(seed, tid, 0, state);`
`58`	`58`
`59`	`59`	`int i;`
`@@ -68,11 +68,11 @@ __global__ void c_normal_kernel(unsigned long long seed, curandState* states,`
`68`	`68`	`}`
`69`	`69`
`70`	`70`	// generate `n` random complex double numbers on GPU
`71`		`-__global__ void z_normal_kernel(unsigned long long seed, curandState* states,`
	`71`	`+__global__ void z_normal_kernel(unsigned long long seed, curandStatePhilox4_32_10_t* states,`
`72`	`72`	`cuDoubleComplex* v, int n)`
`73`	`73`	`{`
`74`	`74`	`int tid = blockIdx.x * blockDim.x + threadIdx.x;`
`75`		`- curandState* state = states + tid;`
	`75`	`+ curandStatePhilox4_32_10_t* state = states + tid;`
`76`	`76`	`curand_init(seed, tid, 0, state);`
`77`	`77`
`78`	`78`	`int i;`
`@@ -166,26 +166,26 @@ __global__ void zshift_mgpu_matrix(cuDoubleComplex* A, std::size_t* off_m,`
`166`	`166`	`}`
`167`	`167`	`}`
`168`	`168`
`169`		`-void chase_rand_normal(unsigned long long seed, curandState* states, float* v,`
	`169`	`+void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states, float* v,`
`170`	`170`	`int n, cudaStream_t stream_)`
`171`	`171`	`{`
`172`	`172`	`s_normal_kernel<<<GRIDDIM, BLOCKDIM, 0, stream_>>>(seed, states, v, n);`
`173`	`173`	`}`
`174`	`174`
`175`		`-void chase_rand_normal(unsigned long long seed, curandState* states, double* v,`
	`175`	`+void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states, double* v,`
`176`	`176`	`int n, cudaStream_t stream_)`
`177`	`177`	`{`
`178`	`178`	`d_normal_kernel<<<GRIDDIM, BLOCKDIM, 0, stream_>>>(seed, states, v, n);`
`179`	`179`	`}`
`180`	`180`
`181`		`-void chase_rand_normal(unsigned long long seed, curandState* states,`
	`181`	`+void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states,`
`182`	`182`	`std::complex<float>* v, int n, cudaStream_t stream_)`
`183`	`183`	`{`
`184`	`184`	`c_normal_kernel<<<GRIDDIM, BLOCKDIM, 0, stream_>>>(`
`185`	`185`	`seed, states, reinterpret_cast<cuComplex*>(v), n);`
`186`	`186`	`}`
`187`	`187`
`188`		`-void chase_rand_normal(unsigned long long seed, curandState* states,`
	`188`	`+void chase_rand_normal(unsigned long long seed, curandStatePhilox4_32_10_t* states,`
`189`	`189`	`std::complex<double>* v, int n, cudaStream_t stream_)`
`190`	`190`	`{`
`191`	`191`	`z_normal_kernel<<<GRIDDIM, BLOCKDIM, 0, stream_>>>(`