@@ -355,9 +355,13 @@ __XETLA_API xetla_vector<T, N> xetla_load_global(
355
355
__ESIMD_NS::cache_hint_L1<gpu::xetla::detail::get_cache_hint (L1H)>,
356
356
__ESIMD_NS::cache_hint_L2<gpu::xetla::detail::get_cache_hint (L2H)>,
357
357
__ESIMD_NS::alignment<alignment>};
358
- if constexpr (sizeof (T) * N < sizeof (uint32_t )) {
359
- xetla_vector<uint32_t , N> offsets (byte_offset, sizeof (T));
360
- return __ESIMD_NS::gather<T, N, uint32_t >(ptr, offsets);
358
+ if constexpr (sizeof (T) * N < sizeof (uint32_t ) || N == 1 ) {
359
+ xetla_vector<T, N> ret;
360
+ #pragma unroll
361
+ for (uint32_t i = 0 ; i < N; i++) {
362
+ ret[i] = ptr[i + byte_offset / sizeof (T)];
363
+ }
364
+ return ret;
361
365
} else {
362
366
return __ESIMD_NS::block_load<T, N>(ptr, byte_offset, props);
363
367
}
@@ -501,9 +505,11 @@ __XETLA_API void xetla_store_global(
501
505
__ESIMD_NS::cache_hint_L2<gpu::xetla::detail::get_cache_hint (L2H)>,
502
506
__ESIMD_NS::alignment<alignment>};
503
507
504
- if constexpr (sizeof (T) * N < sizeof (uint32_t )) {
505
- xetla_vector<uint32_t , N> offsets (byte_offset, sizeof (T));
506
- return __ESIMD_NS::scatter<T, N, uint32_t >(ptr, offsets, vals);
508
+ if constexpr (sizeof (T) * N < sizeof (uint32_t ) || N == 1 ) {
509
+ #pragma unroll
510
+ for (uint32_t i = 0 ; i < N; i++) {
511
+ ptr[i + byte_offset / sizeof (T)] = vals[i];
512
+ }
507
513
} else {
508
514
__ESIMD_NS::block_store<T, N>(ptr, byte_offset, vals, props);
509
515
}
0 commit comments