Skip to content

Commit 4102c9b

Browse files
Ionut Hristodorescu authored and facebook-github-bot committed
Perf improvement for Half4::store() (pytorch#4857)
Summary: Store the 4 x FP16 values as 2 x INT32 values (two 32-bit stores instead of four 16-bit stores). X-link: facebookresearch/FBGEMM#1881. Differential Revision: D82149371.
1 parent 23f944c commit 4102c9b

File tree

1 file changed

+3
-4
lines changed

1 file changed

+3
-4
lines changed

fbgemm_gpu/include/fbgemm_gpu/utils/float.cuh

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -51,10 +51,9 @@ struct Half4 {
5151

5252
__device__ inline void store(at::Half* p) {
5353
#ifdef USE_ROCM
54-
p[0] = __low2half(a);
55-
p[1] = __high2half(a);
56-
p[2] = __low2half(b);
57-
p[3] = __high2half(b);
54+
*reinterpret_cast<unsigned int*>(p) = *reinterpret_cast<unsigned int*>(&a);
55+
*reinterpret_cast<unsigned int*>(p + 2) =
56+
*reinterpret_cast<unsigned int*>(&b);
5857
#elif CUDA_VERSION >= 9000
5958

6059
#ifndef __HALF2_TO_UI

0 commit comments

Comments
 (0)