Skip to content

Commit 89649df

Browse files
committed
Merge branch 'shift_H_mgpu_fix_bug' into 'master'
Fixed a bug in the indexing of large matrices on the GPU. See merge request SLai/ChASE!22.
2 parents a7ce5fe + 5577bca commit 89649df

File tree

3 files changed

+43
-43
lines changed

3 files changed

+43
-43
lines changed

ChASE-MPI/impl/chase_mpidla_mgpu.hpp

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -22,20 +22,20 @@
2222
#include "chase_mpidla_interface.hpp"
2323
#include "mgpu_cudaDLA.hpp"
2424

25-
void chase_shift_mgpu_matrix(float* A, int* off_m, int* off_n,
26-
int offsize, int ldH, float shift,
25+
void chase_shift_mgpu_matrix(float* A, std::size_t* off_m, std::size_t* off_n,
26+
std::size_t offsize, std::size_t ldH, float shift,
2727
cudaStream_t stream_);
2828

29-
void chase_shift_mgpu_matrix(double* A, int* off_m, int* off_n,
30-
int offsize, int ldH, double shift,
29+
void chase_shift_mgpu_matrix(double* A, std::size_t* off_m, std::size_t* off_n,
30+
std::size_t offsize, std::size_t ldH, double shift,
3131
cudaStream_t stream_);
3232

33-
void chase_shift_mgpu_matrix(std::complex<double>* A, int* off_m, int* off_n,
34-
int offsize, int ldH, double shift,
33+
void chase_shift_mgpu_matrix(std::complex<double>* A, std::size_t* off_m, std::size_t* off_n,
34+
std::size_t offsize, std::size_t ldH, double shift,
3535
cudaStream_t stream_);
3636

37-
void chase_shift_mgpu_matrix(std::complex<float>* A, int* off_m, int* off_n,
38-
int offsize, int ldH, float shift,
37+
void chase_shift_mgpu_matrix(std::complex<float>* A, std::size_t* off_m, std::size_t* off_n,
38+
std::size_t offsize, std::size_t ldH, float shift,
3939
cudaStream_t stream_);
4040

4141

ChASE-MPI/impl/mgpu_cudaDLA.hpp

Lines changed: 15 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -185,9 +185,9 @@ namespace chase {
185185

186186

187187
//for shifting matrix on gpus
188-
int start_row, start_col;
189-
d_off_m_ = (int**) malloc(num_devices_per_rank * sizeof(int*));
190-
d_off_n_ = (int**) malloc(num_devices_per_rank * sizeof(int*));
188+
std::size_t start_row, start_col;
189+
d_off_m_ = (std::size_t**) malloc(num_devices_per_rank * sizeof(std::size_t*));
190+
d_off_n_ = (std::size_t**) malloc(num_devices_per_rank * sizeof(std::size_t*));
191191

192192
for (int dev_x = 0; dev_x < ntile_m_; dev_x++){
193193
tile_x = get_tile_size_row(dev_x);
@@ -197,15 +197,15 @@ namespace chase {
197197
tile_y = get_tile_size_col(dev_y);
198198
start_col = dev_y * dim_tile_n_;
199199
int dev_id = dev_x * ntile_n_ + dev_y;
200-
std::vector<int> off_m, off_n;
200+
std::vector<std::size_t> off_m, off_n;
201201

202202
for(std::size_t j = 0; j < nblocks_; j++){
203203
for(std::size_t i = 0; i < mblocks_; i++){
204204
for(std::size_t q = 0; q < c_lens_[j]; q++){
205205
for(std::size_t p = 0; p < r_lens_[i]; p++){
206206

207207
if(q + c_offs_l_[j] >= start_col && q + c_offs_l_[j] < start_col + tile_y && p + r_offs_l_[i] >= start_row && p + r_offs_l_[i] < start_row + tile_x){
208-
int s, t;
208+
std::size_t s, t;
209209
//t, s, global index
210210
t = q + c_offs_[j];
211211
s = p + r_offs_[i];
@@ -221,13 +221,13 @@ namespace chase {
221221
}
222222
}
223223

224-
int off_size = off_m.size();
224+
std::size_t off_size = off_m.size();
225225
diagonal_offs_.push_back(off_size);
226226
cuda_exec(cudaSetDevice(shmrank_*num_devices_per_rank + dev_id));
227-
cudaMalloc(&d_off_m_[dev_id], off_size * sizeof(int));
228-
cudaMalloc(&d_off_n_[dev_id], off_size * sizeof(int));
229-
cudaMemcpy(d_off_m_[dev_id], off_m.data(), off_size* sizeof(int), cudaMemcpyHostToDevice);
230-
cudaMemcpy(d_off_n_[dev_id], off_n.data(), off_size* sizeof(int), cudaMemcpyHostToDevice);
227+
cudaMalloc(&d_off_m_[dev_id], off_size * sizeof(std::size_t));
228+
cudaMalloc(&d_off_n_[dev_id], off_size * sizeof(std::size_t));
229+
cudaMemcpy(d_off_m_[dev_id], off_m.data(), off_size* sizeof(std::size_t), cudaMemcpyHostToDevice);
230+
cudaMemcpy(d_off_n_[dev_id], off_n.data(), off_size* sizeof(std::size_t), cudaMemcpyHostToDevice);
231231

232232
}
233233
}
@@ -377,7 +377,7 @@ namespace chase {
377377
int tile_x, tile_y;
378378
int count_x = 0, count_y = 0;
379379

380-
int start_row, start_col;
380+
std::size_t start_row, start_col;
381381

382382
for (int dev_x = 0; dev_x < ntile_m_; dev_x++){
383383
tile_x = get_tile_size_row(dev_x);
@@ -388,7 +388,7 @@ namespace chase {
388388
start_col = dev_y * dim_tile_n_;
389389
int dev_id = dev_x * ntile_n_ + dev_y;
390390

391-
int off_size = diagonal_offs_[dev_id];
391+
std::size_t off_size = diagonal_offs_[dev_id];
392392

393393
cuda_exec(cudaSetDevice(shmrank_*num_devices_per_rank + dev_id));
394394
chase_shift_mgpu_matrix(H_[dev_id], d_off_m_[dev_id], d_off_n_[dev_id], off_size, ldH, std::real(c), stream_[dev_id]);
@@ -971,9 +971,9 @@ namespace chase {
971971
std::size_t mblocks_;
972972

973973
//for shifting matrix on gpus
974-
int **d_off_m_ = nullptr;
975-
int **d_off_n_ = nullptr;
976-
std::vector<int> diagonal_offs_;
974+
std::size_t **d_off_m_ = nullptr;
975+
std::size_t **d_off_n_ = nullptr;
976+
std::vector<std::size_t> diagonal_offs_;
977977

978978
/// Return the number of rows of the tile with row-index 'tile_position'
979979
int get_tile_size_row (int tile_position) {

ChASE-MPI/kernels/shift.cu

Lines changed: 20 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -33,40 +33,40 @@ __global__ void zshift_matrix(cuDoubleComplex* A, int n, double shift) {
3333
if (idx < n) A[(idx)*n + idx].x += shift;
3434
}
3535

36-
__global__ void sshift_mgpu_matrix(float* A, int* off_m, int* off_n,
37-
int offsize, int ldH, float shift) {
36+
__global__ void sshift_mgpu_matrix(float* A, std::size_t* off_m, std::size_t* off_n,
37+
std::size_t offsize, std::size_t ldH, float shift) {
3838
int i = blockIdx.x * blockDim.x + threadIdx.x;
39-
int ind;
39+
std::size_t ind;
4040
if(i < offsize){
4141
ind = off_n[i] * ldH + off_m[i];
4242
A[ind] += shift;
4343
}
4444
}
4545

46-
__global__ void dshift_mgpu_matrix(double* A, int* off_m, int* off_n,
47-
int offsize, int ldH, double shift) {
46+
__global__ void dshift_mgpu_matrix(double* A, std::size_t* off_m, std::size_t* off_n,
47+
std::size_t offsize, std::size_t ldH, double shift) {
4848
int i = blockIdx.x * blockDim.x + threadIdx.x;
49-
int ind;
49+
std::size_t ind;
5050
if(i < offsize){
5151
ind = off_n[i] * ldH + off_m[i];
5252
A[ind] += shift;
5353
}
5454
}
5555

56-
__global__ void cshift_mgpu_matrix(cuComplex* A, int* off_m, int* off_n,
57-
int offsize, int ldH, float shift) {
56+
__global__ void cshift_mgpu_matrix(cuComplex* A, std::size_t* off_m, std::size_t* off_n,
57+
std::size_t offsize, std::size_t ldH, float shift) {
5858
int i = blockIdx.x * blockDim.x + threadIdx.x;
59-
int ind;
59+
std::size_t ind;
6060
if(i < offsize){
6161
ind = off_n[i] * ldH + off_m[i];
6262
A[ind].x += shift;
6363
}
6464
}
6565

66-
__global__ void zshift_mgpu_matrix(cuDoubleComplex* A, int* off_m, int* off_n,
67-
int offsize, int ldH, double shift) {
66+
__global__ void zshift_mgpu_matrix(cuDoubleComplex* A, std::size_t* off_m, std::size_t* off_n,
67+
std::size_t offsize, std::size_t ldH, double shift) {
6868
int i = blockIdx.x * blockDim.x + threadIdx.x;
69-
int ind;
69+
std::size_t ind;
7070
if(i < offsize){
7171
ind = off_n[i] * ldH + off_m[i];
7272
A[ind].x += shift;
@@ -101,8 +101,8 @@ void chase_shift_matrix(std::complex<double>* A, int n, double shift,
101101
reinterpret_cast<cuDoubleComplex*>(A), n, shift);
102102
}
103103

104-
void chase_shift_mgpu_matrix(float* A, int* off_m, int* off_n,
105-
int offsize, int ldH, float shift,
104+
void chase_shift_mgpu_matrix(float* A, std::size_t* off_m, std::size_t* off_n,
105+
std::size_t offsize, std::size_t ldH, float shift,
106106
cudaStream_t stream_) {
107107

108108
unsigned int grid = (offsize + 256 - 1) / 256;
@@ -114,8 +114,8 @@ void chase_shift_mgpu_matrix(float* A, int* off_m, int* off_n,
114114
}
115115

116116

117-
void chase_shift_mgpu_matrix(double* A, int* off_m, int* off_n,
118-
int offsize, int ldH, double shift,
117+
void chase_shift_mgpu_matrix(double* A, std::size_t* off_m, std::size_t* off_n,
118+
std::size_t offsize, std::size_t ldH, double shift,
119119
cudaStream_t stream_) {
120120

121121
unsigned int grid = (offsize + 256 - 1) / 256;
@@ -126,8 +126,8 @@ void chase_shift_mgpu_matrix(double* A, int* off_m, int* off_n,
126126

127127
}
128128

129-
void chase_shift_mgpu_matrix(std::complex<float>* A, int* off_m, int* off_n,
130-
int offsize, int ldH, float shift,
129+
void chase_shift_mgpu_matrix(std::complex<float>* A, std::size_t* off_m, std::size_t* off_n,
130+
std::size_t offsize, std::size_t ldH, float shift,
131131
cudaStream_t stream_) {
132132

133133
unsigned int grid = (offsize + 256 - 1) / 256;
@@ -140,8 +140,8 @@ void chase_shift_mgpu_matrix(std::complex<float>* A, int* off_m, int* off_n,
140140
}
141141

142142

143-
void chase_shift_mgpu_matrix(std::complex<double>* A, int* off_m, int* off_n,
144-
int offsize, int ldH, double shift,
143+
void chase_shift_mgpu_matrix(std::complex<double>* A, std::size_t* off_m, std::size_t* off_n,
144+
std::size_t offsize, std::size_t ldH, double shift,
145145
cudaStream_t stream_) {
146146

147147
unsigned int grid = (offsize + 256 - 1) / 256;

0 commit comments

Comments
 (0)