-
Notifications
You must be signed in to change notification settings - Fork 25.4k
libvec: unroll pragma and push stride down #107460
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,15 +18,7 @@ | |
#define SQR8S_STRIDE_BYTES_LEN 16 | ||
#endif | ||
|
||
EXPORT int dot8s_stride() { | ||
return DOT8_STRIDE_BYTES_LEN; | ||
} | ||
|
||
EXPORT int sqr8s_stride() { | ||
return SQR8S_STRIDE_BYTES_LEN; | ||
} | ||
|
||
EXPORT int32_t dot8s(int8_t* a, int8_t* b, size_t dims) { | ||
int32_t dot8s_inner(int8_t* a, int8_t* b, size_t dims) { | ||
// We have contention in the instruction pipeline on the accumulation | ||
// registers if we use too few. | ||
int32x4_t acc1 = vdupq_n_s32(0); | ||
|
@@ -35,6 +27,7 @@ EXPORT int32_t dot8s(int8_t* a, int8_t* b, size_t dims) { | |
int32x4_t acc4 = vdupq_n_s32(0); | ||
|
||
// Some unrolling gives around 50% performance improvement. | ||
#pragma clang loop unroll_count(2) | ||
for (int i = 0; i < dims; i += DOT8_STRIDE_BYTES_LEN) { | ||
// Read into 16 x 8 bit vectors. | ||
int8x16_t va1 = vld1q_s8(a + i); | ||
|
@@ -60,12 +53,26 @@ EXPORT int32_t dot8s(int8_t* a, int8_t* b, size_t dims) { | |
return vaddvq_s32(vaddq_s32(acc5, acc6)); | ||
} | ||
|
||
EXPORT int32_t sqr8s(int8_t *a, int8_t *b, size_t dims) { | ||
EXPORT int32_t dot8s(int8_t* a, int8_t* b, size_t dims) { | ||
int32_t res = 0; | ||
int i = 0; | ||
if (dims > DOT8_STRIDE_BYTES_LEN) { | ||
i += dims & ~(DOT8_STRIDE_BYTES_LEN - 1); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This only works if DOT8_STRIDE_BYTES_LEN is a power of 2. Of course it will be. Perhaps it is worth enforcing this with a static_assert somewhere so people don't accidentally break it though, i.e. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ++ |
||
res = dot8s_inner(a, b, i); | ||
} | ||
for (; i < dims; i++) { | ||
res += a[i] * b[i]; | ||
} | ||
return res; | ||
} | ||
|
||
int32_t sqr8s_inner(int8_t *a, int8_t *b, size_t dims) { | ||
int32x4_t acc1 = vdupq_n_s32(0); | ||
int32x4_t acc2 = vdupq_n_s32(0); | ||
int32x4_t acc3 = vdupq_n_s32(0); | ||
int32x4_t acc4 = vdupq_n_s32(0); | ||
|
||
#pragma clang loop unroll_count(2) | ||
for (int i = 0; i < dims; i += SQR8S_STRIDE_BYTES_LEN) { | ||
int8x16_t va1 = vld1q_s8(a + i); | ||
int8x16_t vb1 = vld1q_s8(b + i); | ||
|
@@ -84,3 +91,17 @@ EXPORT int32_t sqr8s(int8_t *a, int8_t *b, size_t dims) { | |
int32x4_t acc6 = vaddq_s32(acc3, acc4); | ||
return vaddvq_s32(vaddq_s32(acc5, acc6)); | ||
} | ||
|
||
EXPORT int32_t sqr8s(int8_t* a, int8_t* b, size_t dims) { | ||
int32_t res = 0; | ||
int i = 0; | ||
if (i > SQR8S_STRIDE_BYTES_LEN) { | ||
i += dims & ~(SQR8S_STRIDE_BYTES_LEN - 1); | ||
res = sqr8s_inner(a, b, i); | ||
} | ||
for (; i < dims; i++) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe you can try and unroll this loop too? |
||
int32_t dist = a[i] - b[i]; | ||
res += dist * dist; | ||
} | ||
return res; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perhaps it might be worth tweaking the comment a bit, i.e. accumulating into multiple registers gives around 50%, and unroll directive gives around 5%. I think otherwise it is ambiguous.