elastic · ChrisHegarty · Apr 15, 2024 · Apr 15, 2024 · Apr 15, 2024 · Apr 15, 2024
@@ -18,7 +18,7 @@ configurations {
 }
 
 var zstdVersion = "1.5.5"
-var vecVersion = "1.0.1"
+var vecVersion = "1.0.2"
 
 repositories {
   exclusiveContent {

@@ -17,7 +17,6 @@
 import java.lang.invoke.MethodType;
 
 import static java.lang.foreign.ValueLayout.ADDRESS;
-import static java.lang.foreign.ValueLayout.JAVA_BYTE;
 import static java.lang.foreign.ValueLayout.JAVA_INT;
 import static org.elasticsearch.nativeaccess.jdk.LinkerHelper.downcallHandle;
 
@@ -29,24 +28,9 @@ public final class JdkVectorLibrary implements VectorLibrary {
 
     public JdkVectorLibrary() {}
 
-    static final MethodHandle dot8stride$mh = downcallHandle("dot8s_stride", FunctionDescriptor.of(JAVA_INT));
-    static final MethodHandle sqr8stride$mh = downcallHandle("sqr8s_stride", FunctionDescriptor.of(JAVA_INT));
-
     static final MethodHandle dot8s$mh = downcallHandle("dot8s", FunctionDescriptor.of(JAVA_INT, ADDRESS, ADDRESS, JAVA_INT));
     static final MethodHandle sqr8s$mh = downcallHandle("sqr8s", FunctionDescriptor.of(JAVA_INT, ADDRESS, ADDRESS, JAVA_INT));
 
-    // Stride of the native implementation - consumes this number of bytes per loop invocation.
-    // There must be at least this number of bytes/elements available when going native
-    static final int DOT_STRIDE = 32;
-    static final int SQR_STRIDE = 16;
-
-    static {
-        assert DOT_STRIDE > 0 && (DOT_STRIDE & (DOT_STRIDE - 1)) == 0 : "Not a power of two";
-        assert dot8Stride() == DOT_STRIDE : dot8Stride() + " != " + DOT_STRIDE;
-        assert SQR_STRIDE > 0 && (SQR_STRIDE & (SQR_STRIDE - 1)) == 0 : "Not a power of two";
-        assert sqr8Stride() == SQR_STRIDE : sqr8Stride() + " != " + SQR_STRIDE;
-    }
-
     /**
      * Computes the dot product of given byte vectors.
      * @param a address of the first vector
@@ -61,19 +45,7 @@ static int dotProduct(MemorySegment a, MemorySegment b, int length) {
         if (length > a.byteSize()) {
             throw new IllegalArgumentException("length: " + length + ", greater than vector dimensions: " + a.byteSize());
         }
-        int i = 0;
-        int res = 0;
-        if (length >= DOT_STRIDE) {
-            i += length & ~(DOT_STRIDE - 1);
-            res = dot8s(a, b, i);
-        }
-
-        // tail
-        for (; i < length; i++) {
-            res += a.get(JAVA_BYTE, i) * b.get(JAVA_BYTE, i);
-        }
-        assert i == length;
-        return res;
+        return dot8s(a, b, length);
     }
 
     /**
@@ -90,36 +62,7 @@ static int squareDistance(MemorySegment a, MemorySegment b, int length) {
         if (length > a.byteSize()) {
             throw new IllegalArgumentException("length: " + length + ", greater than vector dimensions: " + a.byteSize());
         }
-        int i = 0;
-        int res = 0;
-        if (length >= SQR_STRIDE) {
-            i += length & ~(SQR_STRIDE - 1);
-            res = sqr8s(a, b, i);
-        }
-
-        // tail
-        for (; i < length; i++) {
-            int dist = a.get(JAVA_BYTE, i) - b.get(JAVA_BYTE, i);
-            res += dist * dist;
-        }
-        assert i == length;
-        return res;
-    }
-
-    private static int dot8Stride() {
-        try {
-            return (int) dot8stride$mh.invokeExact();
-        } catch (Throwable t) {
-            throw new AssertionError(t);
-        }
-    }
-
-    private static int sqr8Stride() {
-        try {
-            return (int) sqr8stride$mh.invokeExact();
-        } catch (Throwable t) {
-            throw new AssertionError(t);
-        }
+        return sqr8s(a, b, length);
     }
 
     private static int dot8s(MemorySegment a, MemorySegment b, int length) {

diff --git a/libs/vec/native/build.gradle b/libs/vec/native/build.gradle
@@ -9,13 +9,19 @@ apply plugin: 'c'
 
 var os = org.gradle.internal.os.OperatingSystem.current()
 
-// To update this library run publish_vec_binaries.sh
+// To update this library run publish_vec_binaries.sh (or ./gradlew vecSharedLibrary)
 // Or
 // For local development, build the docker image with:
 //   docker build --platform linux/arm64 --progress=plain .
 // Grab the image id from the console output, then, e.g.
 //   docker run 9c9f36564c148b275aeecc42749e7b4580ded79dcf51ff6ccc008c8861e7a979 > build/libs/vec/shared/libvec.so
 //
+// To run tests and benchmarks on a locally built libvec,
+//  1. Temporarily comment out the download in libs/native/libraries/build.gradle
+//       libs "org.elasticsearch:vec:${vecVersion}@zip"
+//  2. Copy your locally built libvec binary, e.g.
+//       cp libs/vec/native/build/libs/vec/shared/libvec.dylib libs/native/libraries/build/platform/darwin-aarch64/libvec.dylib
+//
 // Look at the disassemble:
 //  objdump --disassemble-symbols=_dot8s build/libs/vec/shared/libvec.dylib
 // Note: symbol decoration may differ on Linux, i.e. the leading underscore is not present

diff --git a/libs/vec/native/publish_vec_binaries.sh b/libs/vec/native/publish_vec_binaries.sh
@@ -19,7 +19,7 @@ if [ -z "$ARTIFACTORY_API_KEY" ]; then
   exit 1;
 fi
 
-VERSION="1.0.1"
+VERSION="1.0.2"
 ARTIFACTORY_REPOSITORY="${ARTIFACTORY_REPOSITORY:-https://artifactory.elastic.dev/artifactory/elasticsearch-native/}"
 TEMP=$(mktemp -d)
 

diff --git a/libs/vec/native/src/vec/c/vec.c b/libs/vec/native/src/vec/c/vec.c
@@ -18,15 +18,7 @@
 #define SQR8S_STRIDE_BYTES_LEN 16
 #endif
 
-EXPORT int dot8s_stride() {
-    return DOT8_STRIDE_BYTES_LEN;
-}
-
-EXPORT int sqr8s_stride() {
-    return SQR8S_STRIDE_BYTES_LEN;
-}
-
-EXPORT int32_t dot8s(int8_t* a, int8_t* b, size_t dims) {
+int32_t dot8s_inner(int8_t* a, int8_t* b, size_t dims) {
     // We have contention in the instruction pipeline on the accumulation
     // registers if we use too few.
     int32x4_t acc1 = vdupq_n_s32(0);
@@ -35,6 +27,7 @@ EXPORT int32_t dot8s(int8_t* a, int8_t* b, size_t dims) {
     int32x4_t acc4 = vdupq_n_s32(0);
 
     // Some unrolling gives around 50% performance improvement.
+    #pragma clang loop unroll_count(2)
     for (int i = 0; i < dims; i += DOT8_STRIDE_BYTES_LEN) {
         // Read into 16 x 8 bit vectors.
         int8x16_t va1 = vld1q_s8(a + i);
@@ -60,12 +53,26 @@ EXPORT int32_t dot8s(int8_t* a, int8_t* b, size_t dims) {
     return vaddvq_s32(vaddq_s32(acc5, acc6));
 }
 
-EXPORT int32_t sqr8s(int8_t *a, int8_t *b, size_t dims) {
+EXPORT int32_t dot8s(int8_t* a, int8_t* b, size_t dims) { // Dot product of the first dims int8 elements of a and b.
+    int32_t res = 0;
+    int i = 0;
+    if (dims >= DOT8_STRIDE_BYTES_LEN) { // At least one whole stride: hand the stride-aligned prefix to the bulk kernel.
+        i += dims & ~(DOT8_STRIDE_BYTES_LEN - 1); // Round dims down to a multiple of the stride (stride is a power of two).
+        res = dot8s_inner(a, b, i);
+    }
+    for (; i < dims; i++) { // Scalar tail: the elements left over after the last whole stride.
+        res += a[i] * b[i];
+    }
+    return res;
+}
+
+int32_t sqr8s_inner(int8_t *a, int8_t *b, size_t dims) {
     int32x4_t acc1 = vdupq_n_s32(0);
     int32x4_t acc2 = vdupq_n_s32(0);
     int32x4_t acc3 = vdupq_n_s32(0);
     int32x4_t acc4 = vdupq_n_s32(0);
 
+    #pragma clang loop unroll_count(2)
     for (int i = 0; i < dims; i += SQR8S_STRIDE_BYTES_LEN) {
         int8x16_t va1 = vld1q_s8(a + i);
         int8x16_t vb1 = vld1q_s8(b + i);
@@ -84,3 +91,17 @@ EXPORT int32_t sqr8s(int8_t *a, int8_t *b, size_t dims) {
     int32x4_t acc6 = vaddq_s32(acc3, acc4);
     return vaddvq_s32(vaddq_s32(acc5, acc6));
 }
+
+EXPORT int32_t sqr8s(int8_t* a, int8_t* b, size_t dims) { // Sum of squared differences over the first dims int8 elements of a and b.
+    int32_t res = 0;
+    int i = 0;
+    if (dims >= SQR8S_STRIDE_BYTES_LEN) { // Guard on dims (i is still 0 here): at least one whole stride goes to the bulk kernel.
+        i += dims & ~(SQR8S_STRIDE_BYTES_LEN - 1); // Round dims down to a multiple of the stride (stride is a power of two).
+        res = sqr8s_inner(a, b, i);
+    }
+    for (; i < dims; i++) { // Scalar tail: the elements left over after the last whole stride.
+        int32_t dist = a[i] - b[i];
+        res += dist * dist;
+    }
+    return res;
+}
diff --git a/libs/vec/native/src/vec/headers/vec.h b/libs/vec/native/src/vec/headers/vec.h
@@ -8,10 +8,6 @@
 
 #define EXPORT __attribute__((externally_visible,visibility("default")))
 
-EXPORT int dot8s_stride();
-
-EXPORT int sqr8s_stride();
-
 EXPORT int32_t dot8s(int8_t* a, int8_t* b, size_t dims);
 
 EXPORT int32_t sqr8s(int8_t *a, int8_t *b, size_t length);