Open
Description
We can rewrite the whole "pipeline" as a single C 99 version:
void pipeline_kernel_c99(unsigned long start, unsigned long end, unsigned long *sum_out, unsigned long *count_out) {
unsigned long sum = 0;
unsigned long count = 0;
unsigned long max_power_of_three = 12157665459056928801ull;
for (unsigned long value = start; value <= end; ++value) {
// Check if the value is a power of two
if (value > 0 && (value & (value - 1)) == 0) continue;
// Check if the value is a power of three
if (value > 0 && max_power_of_three % value == 0) continue;
// Prime factorization
unsigned long n = value;
unsigned long factor = 2;
// Handle factor 2 separately
while (n % 2 == 0) {
sum += 2;
count += 1;
n /= 2;
}
// Handle odd factors
factor = 3;
while (factor * factor <= n) {
while (n % factor == 0) {
sum += factor;
count += 1;
n /= factor;
}
factor += 2;
}
// If n is still greater than 1, it is a prime factor
if (n > 1) {
sum += n;
count += 1;
}
}
*sum_out = sum;
*count_out = count;
}
GCC 14.2 produces the following Assembly:
pipeline_kernel_c99(unsigned long, unsigned long, unsigned long*, unsigned long*):
push rbp
mov rbp, rcx
push rbx
mov rbx, rdx
cmp rsi, rdi
jb .L15
mov r9, rdi
mov r10, rsi
xor edi, edi
xor r8d, r8d
movabs r11, -6289078614652622815
jmp .L14
.L7:
add r9, 1
cmp r10, r9
jb .L2
.L14:
xor ecx, ecx
test r9, r9
je .L9
lea rax, [r9-1]
test rax, r9
je .L7
mov rax, r11
xor edx, edx
div r9
test rdx, rdx
je .L7
mov rcx, r9
test r9b, 1
jne .L39
.L9:
shr rcx
add r8, 2
add rdi, 1
test cl, 1
je .L9
cmp rcx, 8
jbe .L10
.L5:
mov esi, 3
.L11:
mov rax, rcx
xor edx, edx
div rsi
test rdx, rdx
jne .L13
.L12:
mov rax, rcx
xor edx, edx
add r8, rsi
add rdi, 1
div rsi
xor edx, edx
mov rcx, rax
div rsi
test rdx, rdx
je .L12
.L13:
add rsi, 2
mov rax, rsi
imul rax, rsi
cmp rcx, rax
jnb .L11
.L10:
cmp rcx, 1
je .L7
.L6:
add r9, 1
add r8, rcx
add rdi, 1
cmp r10, r9
jnb .L14
.L2:
mov QWORD PTR [rbx], r8
mov QWORD PTR [rbp+0], rdi
pop rbx
pop rbp
ret
.L39:
cmp r9, 8
ja .L5
jmp .L6
.L15:
xor edi, edi
xor r8d, r8d
mov QWORD PTR [rbx], r8
mov QWORD PTR [rbp+0], rdi
pop rbx
pop rbp
ret
It seems like this can be further optimized. Who's up for the task?