@@ -132,7 +132,8 @@ __device__ void u16_transpose_pack_u64(U64 *out_a, U64 *out_b, U16 *in_a,
132
132
133
133
// Performs the transpose for a and b in parallel
134
134
__device__ void u32_transpose_pack_u64 (U64 *out_a, U64 *out_b, U32 *in_a,
135
- U32 *in_b, size_t in_len, size_t out_len) {
135
+ U32 *in_b, size_t in_len,
136
+ size_t out_len) {
136
137
// in has size in_len = 64 * n
137
138
// out has size out_len, where each element is an array of n elements
138
139
// Thus out itself has n * out_len elements (split into n arrays)
@@ -168,10 +169,10 @@ __device__ void lift_mul_sub(U32 *mask, U16 *mask_corr1, U16 *mask_corr2,
168
169
*mask -= (U32)(*mask_corr1) << 16 ;
169
170
*mask -= (U32)(*mask_corr2) << 17 ;
170
171
171
- U32 a ;
172
- mul_lift_b (&a , code);
172
+ U32 lifted ;
173
+ mul_lift_b (&lifted , code);
173
174
*mask *= A;
174
- *mask -= a ;
175
+ *mask -= lifted ;
175
176
}
176
177
177
178
__device__ void split_inner (U64 *x1_a, U64 *x1_b, U64 *x2_a, U64 *x2_b,
@@ -322,11 +323,21 @@ extern "C" __global__ void lift_split(U16 *in_a, U16 *in_b, U32 *lifted_a,
322
323
// Applies lift_mul_sub to both secret shares of the mask in parallel,
// one element per thread (launch with at least n total threads).
// After the lift, the party selected by `id` adds 1 to its share:
//   id == 0 -> mask_a gets the +1, id == 1 -> mask_b gets it,
//   any other id -> no offset is applied.
// The +1 transforms the subsequent comparison from <= into <.
// mask_corr_a / mask_corr_b each hold 2*n correction values
// (element i uses entries i and i + n).
extern "C" __global__ void shared_lift_mul_sub(U32 *mask_a, U32 *mask_b,
                                               U16 *mask_corr_a,
                                               U16 *mask_corr_b, U16 *code_a,
                                               U16 *code_b, int id, size_t n) {
  size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= n) {
    return; // grid tail: nothing to do for out-of-range threads
  }
  lift_mul_sub(&mask_a[idx], &mask_corr_a[idx], &mask_corr_a[idx + n],
               &code_a[idx]);
  lift_mul_sub(&mask_b[idx], &mask_corr_b[idx], &mask_corr_b[idx + n],
               &code_b[idx]);
  if (id == 0) {
    mask_a[idx] += 1; // Transforms the <= into <
  } else if (id == 1) {
    mask_b[idx] += 1; // Transforms the <= into <
  }
}
332
343
0 commit comments