
Commit 2822872

Extract Arith operation definitions to cuda file.
* During the extraction, blas operations now convert int to float.
1 parent b96b927 commit 2822872
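
The int-to-float conversion called out above is not itself visible in this diff; it presumably happens inside the Blas wrappers, since cuBLAS exposes no integer GEMM. A minimal sketch of how such a shim could look (the function name, layout, and cast functors below are illustrative assumptions, not code from this commit):

#include <cublas_v2.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>

struct IntToFloat {
  __host__ __device__ float operator()(int v) const { return static_cast<float>(v); }
};
struct FloatToInt {
  __host__ __device__ int operator()(float v) const { return static_cast<int>(v); }
};

// Hypothetical shim: run an int GEMM by casting through float buffers,
// calling the float cuBLAS kernel, then casting the result back.
void gemm_int_via_float(cublasHandle_t handle,
                        const int* d_a, const int* d_b, int* d_c,
                        int m, int n, int k) {
  thrust::device_vector<float> fa(static_cast<size_t>(m) * k);
  thrust::device_vector<float> fb(static_cast<size_t>(k) * n);
  thrust::device_vector<float> fc(static_cast<size_t>(m) * n);
  thrust::device_ptr<const int> a(d_a), b(d_b);
  thrust::transform(a, a + fa.size(), fa.begin(), IntToFloat());
  thrust::transform(b, b + fb.size(), fb.begin(), IntToFloat());
  const float alpha = 1.0f, beta = 0.0f;
  // Column-major: C (m x n) = alpha * A (m x k) * B (k x n) + beta * C
  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
              &alpha, thrust::raw_pointer_cast(fa.data()), m,
              thrust::raw_pointer_cast(fb.data()), k,
              &beta, thrust::raw_pointer_cast(fc.data()), m);
  thrust::transform(fc.begin(), fc.end(), thrust::device_ptr<int>(d_c),
                    FloatToInt());
}

Note that the cast back to int truncates, so integer matrices only round-trip exactly while the products stay within float's 24-bit integer range.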

File tree

3 files changed: +260 -131 lines changed


src/gpu/kmeans/KmMatrix/Arith.cu

+163
@@ -0,0 +1,163 @@
#include "Arith.hpp"

namespace H2O4GPU {
namespace KMeans {

namespace kernel {

/*
 * Compute the min value of each row.
 * @tparam T Numeric type of the data
 * @param _res The output matrix with shape m x 1
 * @param _val The input matrix with shape m x n
 */
template <typename T>
__global__ void row_min_sequential(kParam<T> _res, kParam<T> _val) {
  size_t idx = global_thread_idx();
  if (idx < _val.rows) {
    T min = std::numeric_limits<T>::max();
    for (size_t i = 0; i < _val.cols; ++i) {
      T value = _val.ptr[idx * _val.cols + i];
      if (value < min) {
        min = value;
      }
    }
    _res.ptr[idx] = min;
  }
}

/*
 * Compute the index of the min value of each row.
 * @tparam T Numeric type of the data
 * @param _res The output index matrix with shape m x 1
 * @param _val The input matrix with shape m x n
 */
template <typename T>
__global__ void row_argmin_sequential(kParam<int> _res, kParam<T> _val) {
  size_t idx = global_thread_idx();
  if (idx < _val.rows) {
    T min = std::numeric_limits<T>::max();
    int min_idx = -1;
    for (size_t i = 0; i < _val.cols; ++i) {
      T value = _val.ptr[idx * _val.cols + i];
      if (value < min) {
        min = value;
        min_idx = i;
      }
    }
    _res.ptr[idx] = min_idx;
  }
}

}  // namespace kernel

template <typename T>
void DotOp<T>::dot(KmMatrix<T>& _res, KmMatrix<T>& _val) {
  this->dot(_res, _val, _val);
}

template <typename T>
void DotOp<T>::dot(KmMatrix<T>& _res, KmMatrix<T>& _lhs, KmMatrix<T>& _rhs) {
  constexpr T alpha = 1.0;
  constexpr T beta = 1.0;
  cublasHandle_t handle = GpuInfo::ins().cublas_handle();
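  // NB: beta == 1.0, so the gemm below accumulates:
  // _res = alpha * _lhs * _rhs + _res. _res is presumably
  // zero-initialized by the caller.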
  Blas::gemm(handle,
             CUBLAS_OP_N, CUBLAS_OP_N,  // FIXME
             _lhs.rows(), _rhs.cols(), _lhs.cols(),
             &alpha,
             _lhs.dev_ptr(), _lhs.cols(),
             _rhs.dev_ptr(), _rhs.cols(),
             &beta,
             _res.dev_ptr(), _res.cols());
}

template <typename T>
void VecBatchDotOp<T>::dot(KmMatrix<T>& _res, KmMatrix<T>& _val) {
  this->dot(_res, _val, _val);
}

template <typename T>
void VecBatchDotOp<T>::dot(KmMatrix<T>& _res, KmMatrix<T>& _lhs,
                           KmMatrix<T>& _rhs) {
  constexpr T alpha = 1.0;
  constexpr T beta = 1.0;
  cublasHandle_t handle = GpuInfo::ins().cublas_handle();
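  // NB: with m = n = 1 and k = _rhs.cols(), each batch multiplies one
  // 1 x k row of _lhs by a transposed row of _rhs, so batch i yields the
  // scalar dot(_lhs.row(i), _rhs.row(i)). The batch stride of cols
  // advances one row at a time, and the result stride of 1 packs the
  // scalars into a column vector.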
  Blas::gemm_strided_batched(
      handle,
      CUBLAS_OP_N, CUBLAS_OP_T,
      1, 1, _rhs.cols(),  // m, n, k
      &alpha,
      _lhs.dev_ptr(), 1, _lhs.cols(),
      _rhs.dev_ptr(), 1, _rhs.cols(),
      &beta,
      _res.dev_ptr(), _res.cols(), 1,  // c should be a column vector
      _lhs.rows());
}

template <typename T>
T SumOp<T>::sum(KmMatrix<T>& _val) {
  T* raw_ptr = _val.dev_ptr();
  thrust::device_ptr<T> ptr (raw_ptr);
  T res = thrust::reduce(ptr, ptr + _val.size(), (T)0, thrust::plus<T>());
  return res;
}

template <typename T>
void MulOp<T>::mul(KmMatrix<T>& _res, KmMatrix<T>& _lhs, T _rhs) {
  cublasHandle_t handle = GpuInfo::ins().cublas_handle();
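  // NB: axpy computes _res = _rhs * _lhs + _res (multiply-accumulate);
  // a pure scale presumably relies on _res starting zeroed.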
  Blas::axpy(
      handle, _lhs.size(),  // handle, n
      &_rhs,                // alpha
      _lhs.dev_ptr(), 1,
      _res.dev_ptr(), 1);
}

template <typename T>
T MeanOp<T>::mean(KmMatrix<T>& _val) {
  T res = SumOp<T>().sum(_val);
  res = res / _val.size();
  return res;
}

template <typename T>
KmMatrix<int> ArgMinOp<T>::argmin(KmMatrix<T>& _val, KmMatrixDim _dim) {
  if (_dim == KmMatrixDim::ROW) {
    KmMatrix<int> _res(_val.rows(), 1);
    kernel::row_argmin_sequential<<<div_roundup(_val.rows(), 256), 256>>>(
        _res.k_param(), _val.k_param());
    return _res;
  } else {
    // FIXME
    M_ERROR("Not implemented");
  }
}

template <typename T>
KmMatrix<T> MinOp<T>::min(KmMatrix<T>& _val, KmMatrixDim _dim) {
  size_t blocks = GpuInfo::ins().blocks(32);
  if (_dim == KmMatrixDim::ROW) {
    KmMatrix<T> _res(_val.rows(), 1);
    kernel::row_min_sequential<<<div_roundup(_val.rows(), 256), 256>>>(
        _res.k_param(), _val.k_param());
    return _res;
  } else {
    // FIXME
    M_ERROR("Not implemented");
  }
}

#define INSTANTIATE(T)                                                       \
  template void DotOp<T>::dot(KmMatrix<T>& _res, KmMatrix<T>& _val);         \
  template void DotOp<T>::dot(KmMatrix<T>& _res, KmMatrix<T>& _lhs,          \
                              KmMatrix<T>& _rhs);                            \
  template void VecBatchDotOp<T>::dot(KmMatrix<T>& _res, KmMatrix<T>& _val); \
  template void VecBatchDotOp<T>::dot(KmMatrix<T>& _res, KmMatrix<T>& _lhs,  \
                                      KmMatrix<T>& _rhs);                    \
  template T SumOp<T>::sum(KmMatrix<T>& _val);                               \
  template void MulOp<T>::mul(KmMatrix<T>& _res, KmMatrix<T>& _lhs, T _rhs); \
  template T MeanOp<T>::mean(KmMatrix<T>& _val);                             \
  template KmMatrix<int> ArgMinOp<T>::argmin(KmMatrix<T>& _val,              \
                                             KmMatrixDim _dim);              \
  template KmMatrix<T> MinOp<T>::min(KmMatrix<T>& _val, KmMatrixDim _dim);

INSTANTIATE(double)
INSTANTIATE(float)
INSTANTIATE(int)

}  // namespace KMeans
}  // namespace H2O4GPU
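
For orientation, a hedged usage sketch of the extracted ops; the KmMatrix value `data` and its construction are assumed, while the op signatures come from the file above:

// Assume `data` is an m x n KmMatrix<float> already resident on the device.
float total = SumOp<float>().sum(data);    // reduces over all m * n entries
float avg   = MeanOp<float>().mean(data);  // total / data.size()

// Row-wise reductions return an m x 1 result.
KmMatrix<float> row_mins   = MinOp<float>().min(data, KmMatrixDim::ROW);
KmMatrix<int>   row_argmin = ArgMinOp<float>().argmin(data, KmMatrixDim::ROW);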

src/gpu/kmeans/KmMatrix/Arith.hpp

+9-123
@@ -8,50 +8,6 @@
 namespace H2O4GPU {
 namespace KMeans {

-namespace kernel {
-
-/*
- * Compute min value for each row.
- * @tparam T Numeric type of the data
- * @param _res The output matrix with shape m x 1
- * @param _val The input matrix with shape m x n
- */
-template <typename T>
-__global__ void row_min_sequential(kParam<T> _res, kParam<T> _val) {
-  size_t idx = global_thread_idx();
-  if (idx < _val.rows) {
-    T min = std::numeric_limits<T>::max();
-    for (size_t i = 0; i < _val.cols; ++i) {
-      T value = _val.ptr[idx * _val.cols + i];
-      if (value < min) {
-        min = value;
-      }
-    }
-    _res.ptr[idx] = min;
-  }
-}
-
-template <typename T>
-__global__ void row_argmin_sequential(kParam<int> _res, kParam<T> _val) {
-  size_t idx = global_thread_idx();
-  if (idx < _val.rows) {
-    T min = std::numeric_limits<T>::max();
-    int min_idx = -1;
-    for (size_t i = 0; i < _val.cols; ++i) {
-      T value = _val.ptr[idx * _val.cols + i];
-      if (value < min) {
-        min = value;
-        min_idx = i;
-      }
-    }
-    _res.ptr[idx] = min_idx;
-  }
-}
-
-} // namespace kernel
-
 // FIXME: Using struct for operations is just keeping the possibility of
 // creating unified operations for KmMatrix. For example, let KmMatrix
 // inherit those left-associative ops, or create an interface for elementwise
@@ -60,110 +16,40 @@ __global__ void row_argmin_sequential(kParam<int> _res, kParam<T> _val) {
 // FIXME: Use return value instead.
 template <typename T>
 struct DotOp {
-  void dot(KmMatrix<T>& _res, KmMatrix<T>& _val) {
-    this->dot(_res, _val, _val);
-  }
-  void dot(KmMatrix<T>& _res, KmMatrix<T>& _lhs,
-           KmMatrix<T>& _rhs) {
-    constexpr T alpha = 1.0;
-    constexpr T beta = 1.0;
-    cublasHandle_t handle = GpuInfo::ins().cublas_handle();
-    Blas::gemm(handle,
-               CUBLAS_OP_N, CUBLAS_OP_N, // FIXME
-               _lhs.rows(), _rhs.cols(), _lhs.cols(),
-               &alpha,
-               _lhs.dev_ptr(), _lhs.cols(),
-               _rhs.dev_ptr(), _rhs.cols(),
-               &beta,
-               _res.dev_ptr(), _res.cols());
-  }
+  void dot(KmMatrix<T>& _res, KmMatrix<T>& _val);
+  void dot(KmMatrix<T>& _res, KmMatrix<T>& _lhs, KmMatrix<T>& _rhs);
 };

 template <typename T>
 struct VecBatchDotOp {
-  void dot(KmMatrix<T>& _res, KmMatrix<T>& _val) {
-    this->dot(_res, _val, _val);
-  }
-  void dot(KmMatrix<T>& _res, KmMatrix<T>& _lhs, KmMatrix<T>& _rhs) {
-    constexpr T alpha = 1.0;
-    constexpr T beta = 1.0;
-    cublasHandle_t handle = GpuInfo::ins().cublas_handle();
-    Blas::gemm_strided_batched(
-        handle,
-        CUBLAS_OP_N, CUBLAS_OP_T,
-        1, 1, _rhs.cols(), // m, n, k
-        &alpha,
-        _lhs.dev_ptr(), 1, _lhs.cols(),
-        _rhs.dev_ptr(), 1, _rhs.cols(),
-        &beta,
-        _res.dev_ptr(), _res.cols(), 1, // c should be a column vector
-        _lhs.rows());
-  }
+  void dot(KmMatrix<T>& _res, KmMatrix<T>& _val);
+  void dot(KmMatrix<T>& _res, KmMatrix<T>& _lhs, KmMatrix<T>& _rhs);
 };

 template <typename T>
 struct SumOp {
-  T sum(KmMatrix<T>& _val) {
-    T* raw_ptr = _val.dev_ptr();
-    thrust::device_ptr<T> ptr (raw_ptr);
-    T res = thrust::reduce(ptr, ptr + _val.size(), (T)0, thrust::plus<T>());
-    return res;
-  }
+  T sum(KmMatrix<T>& _val);
 };

 template <typename T>
 struct MulOp {
-  void mul(KmMatrix<T>& _res, KmMatrix<T>& _lhs, T _rhs) {
-    cublasHandle_t handle = GpuInfo::ins().cublas_handle();
-    Blas::axpy(
-        handle, _lhs.size(), // handle, n
-        &_rhs, // alpha
-        _lhs.dev_ptr(), 1,
-        _res.dev_ptr(), 1);
-  }
+  void mul(KmMatrix<T>& _res, KmMatrix<T>& _lhs, T _rhs);
 };


 template <typename T>
 struct MeanOp {
-  T mean(KmMatrix<T>& _val) {
-    T res = SumOp<T>().sum(_val);
-    res = res / _val.size();
-    return res;
-  }
+  T mean(KmMatrix<T>& _val);
 };

 template <typename T>
 struct ArgMinOp {
-
-  KmMatrix<int> argmin(KmMatrix<T>& _val, KmMatrixDim _dim) {
-    if (_dim == KmMatrixDim::ROW) {
-      KmMatrix<int> _res(_val.rows(), 1);
-      kernel::row_argmin_sequential<<<div_roundup(_val.rows(), 256), 256>>>(
-          _res.k_param(), _val.k_param());
-      return _res;
-    } else {
-      // FIXME
-      M_ERROR("Not implemented");
-    }
-  }
+  KmMatrix<int> argmin(KmMatrix<T>& _val, KmMatrixDim _dim);
 };

 template <typename T>
 struct MinOp {
-
-  KmMatrix<T> min(KmMatrix<T>& _val, KmMatrixDim _dim) {
-    size_t blocks = GpuInfo::ins().blocks(32);
-    if (_dim == KmMatrixDim::ROW) {
-      KmMatrix<T> _res(_val.rows(), 1);
-      kernel::row_min_sequential<<<div_roundup(_val.rows(), 256), 256>>>(
-          _res.k_param(), _val.k_param());
-      return _res;
-    } else {
-      // FIXME
-      M_ERROR("Not implemented");
-    }
-  }
+  KmMatrix<T> min(KmMatrix<T>& _val, KmMatrixDim _dim);
 };

 } // namespace KMeans
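
One consequence of moving the definitions out of this header and into Arith.cu: the templates now link only for the types listed in the INSTANTIATE block (double, float, int). A minimal illustration, with hypothetical matrices mat_f and mat_l:

SumOp<float>().sum(mat_f);  // links: INSTANTIATE(float) exists in Arith.cu
SumOp<long>().sum(mat_l);   // compiles, but undefined reference at link time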
