Skip to content

Commit b486d90

Browse files
authored
Add low-level optimized Neon, AVX2, and AVX 512 float32 vector operations (#130635)
This commit adds low-level optimized Neon, AVX2, and AVX 512 float32 vector operations: cosine, dot product, and square distance. The changes in this PR give an approximately 2x performance increase for float32 vector operations across Linux/Mac AArch64 and Linux x64 (both AVX2 and AVX 512). The performance increase comes mostly from being able to score the vectors off-heap (rather than copying them on-heap before scoring). The low-level native scorer implementations show only an approximately 3-5% improvement over the existing Panama Vector implementation. However, the native scorers allow scoring off-heap. The use of Panama Vector with MemorySegments runs into a performance bug in Hotspot, where the bound is not optimally hoisted out of the hot loop (this has been reported to and acknowledged by OpenJDK). These vector ops will be used by higher-level vector scorers in #130541
1 parent 8cb5b89 commit b486d90

File tree

12 files changed

+1225
-4
lines changed

12 files changed

+1225
-4
lines changed
Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the "Elastic License
4+
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
5+
* Public License v 1"; you may not use this file except in compliance with, at
6+
* your election, the "Elastic License 2.0", the "GNU Affero General Public
7+
* License v3.0 only", or the "Server Side Public License, v 1".
8+
*/
9+
package org.elasticsearch.benchmark.vector;
10+
11+
import org.apache.lucene.util.VectorUtil;
12+
import org.elasticsearch.common.logging.LogConfigurator;
13+
import org.elasticsearch.common.logging.NodeNamePatternConverter;
14+
import org.elasticsearch.nativeaccess.NativeAccess;
15+
import org.elasticsearch.nativeaccess.VectorSimilarityFunctions;
16+
import org.openjdk.jmh.annotations.Benchmark;
17+
import org.openjdk.jmh.annotations.BenchmarkMode;
18+
import org.openjdk.jmh.annotations.Fork;
19+
import org.openjdk.jmh.annotations.Level;
20+
import org.openjdk.jmh.annotations.Measurement;
21+
import org.openjdk.jmh.annotations.Mode;
22+
import org.openjdk.jmh.annotations.OutputTimeUnit;
23+
import org.openjdk.jmh.annotations.Param;
24+
import org.openjdk.jmh.annotations.Scope;
25+
import org.openjdk.jmh.annotations.Setup;
26+
import org.openjdk.jmh.annotations.State;
27+
import org.openjdk.jmh.annotations.TearDown;
28+
import org.openjdk.jmh.annotations.Warmup;
29+
30+
import java.lang.foreign.Arena;
31+
import java.lang.foreign.MemorySegment;
32+
import java.lang.foreign.ValueLayout;
33+
import java.nio.ByteOrder;
34+
import java.util.concurrent.ThreadLocalRandom;
35+
import java.util.concurrent.TimeUnit;
36+
37+
@BenchmarkMode(Mode.AverageTime)
38+
@OutputTimeUnit(TimeUnit.NANOSECONDS)
39+
@State(Scope.Benchmark)
40+
@Warmup(iterations = 3, time = 1)
41+
@Measurement(iterations = 5, time = 1)
42+
public class JDKVectorFloat32Benchmark {
43+
44+
static {
45+
NodeNamePatternConverter.setGlobalNodeName("foo");
46+
LogConfigurator.loadLog4jPlugins();
47+
LogConfigurator.configureESLogging(); // native access requires logging to be initialized
48+
}
49+
50+
static final ValueLayout.OfFloat LAYOUT_LE_FLOAT = ValueLayout.JAVA_FLOAT_UNALIGNED.withOrder(ByteOrder.LITTLE_ENDIAN);
51+
52+
float[] floatsA;
53+
float[] floatsB;
54+
float[] scratch;
55+
MemorySegment heapSegA, heapSegB;
56+
MemorySegment nativeSegA, nativeSegB;
57+
58+
Arena arena;
59+
60+
@Param({ "1", "128", "207", "256", "300", "512", "702", "1024", "1536", "2048" })
61+
public int size;
62+
63+
@Setup(Level.Iteration)
64+
public void init() {
65+
ThreadLocalRandom random = ThreadLocalRandom.current();
66+
67+
floatsA = new float[size];
68+
floatsB = new float[size];
69+
scratch = new float[size];
70+
for (int i = 0; i < size; ++i) {
71+
floatsA[i] = random.nextFloat();
72+
floatsB[i] = random.nextFloat();
73+
}
74+
heapSegA = MemorySegment.ofArray(floatsA);
75+
heapSegB = MemorySegment.ofArray(floatsB);
76+
77+
arena = Arena.ofConfined();
78+
nativeSegA = arena.allocate((long) floatsA.length * Float.BYTES);
79+
MemorySegment.copy(MemorySegment.ofArray(floatsA), LAYOUT_LE_FLOAT, 0L, nativeSegA, LAYOUT_LE_FLOAT, 0L, floatsA.length);
80+
nativeSegB = arena.allocate((long) floatsB.length * Float.BYTES);
81+
MemorySegment.copy(MemorySegment.ofArray(floatsB), LAYOUT_LE_FLOAT, 0L, nativeSegB, LAYOUT_LE_FLOAT, 0L, floatsB.length);
82+
}
83+
84+
@TearDown
85+
public void teardown() {
86+
arena.close();
87+
}
88+
89+
// -- cosine
90+
91+
@Benchmark
92+
@Fork(value = 3, jvmArgsPrepend = { "--add-modules=jdk.incubator.vector" })
93+
public float cosineLucene() {
94+
return VectorUtil.cosine(floatsA, floatsB);
95+
}
96+
97+
@Benchmark
98+
@Fork(value = 3, jvmArgsPrepend = { "--add-modules=jdk.incubator.vector" })
99+
public float cosineLuceneWithCopy() {
100+
// add a copy to better reflect what Lucene has to do to get the target vector on-heap
101+
MemorySegment.copy(nativeSegB, LAYOUT_LE_FLOAT, 0L, scratch, 0, scratch.length);
102+
return VectorUtil.cosine(floatsA, scratch);
103+
}
104+
105+
@Benchmark
106+
@Fork(value = 3, jvmArgsPrepend = { "--add-modules=jdk.incubator.vector" })
107+
public float cosineNativeWithNativeSeg() {
108+
return cosineFloat32(nativeSegA, nativeSegB, size);
109+
}
110+
111+
@Benchmark
112+
@Fork(value = 3, jvmArgsPrepend = { "--add-modules=jdk.incubator.vector" })
113+
public float cosineNativeWithHeapSeg() {
114+
return cosineFloat32(heapSegA, heapSegB, size);
115+
}
116+
117+
// -- dot product
118+
119+
@Benchmark
120+
@Fork(value = 3, jvmArgsPrepend = { "--add-modules=jdk.incubator.vector" })
121+
public float dotProductLucene() {
122+
return VectorUtil.dotProduct(floatsA, floatsB);
123+
}
124+
125+
@Benchmark
126+
@Fork(value = 3, jvmArgsPrepend = { "--add-modules=jdk.incubator.vector" })
127+
public float dotProductLuceneWithCopy() {
128+
// add a copy to better reflect what Lucene has to do to get the target vector on-heap
129+
MemorySegment.copy(nativeSegB, LAYOUT_LE_FLOAT, 0L, scratch, 0, scratch.length);
130+
return VectorUtil.dotProduct(floatsA, scratch);
131+
}
132+
133+
@Benchmark
134+
@Fork(value = 3, jvmArgsPrepend = { "--add-modules=jdk.incubator.vector" })
135+
public float dotProductNativeWithNativeSeg() {
136+
return dotProductFloat32(nativeSegA, nativeSegB, size);
137+
}
138+
139+
@Benchmark
140+
@Fork(value = 3, jvmArgsPrepend = { "--add-modules=jdk.incubator.vector" })
141+
public float dotProductNativeWithHeapSeg() {
142+
return dotProductFloat32(heapSegA, heapSegB, size);
143+
}
144+
145+
// -- square distance
146+
147+
@Benchmark
148+
@Fork(value = 3, jvmArgsPrepend = { "--add-modules=jdk.incubator.vector" })
149+
public float squareDistanceLucene() {
150+
return VectorUtil.squareDistance(floatsA, floatsB);
151+
}
152+
153+
@Benchmark
154+
@Fork(value = 3, jvmArgsPrepend = { "--add-modules=jdk.incubator.vector" })
155+
public float squareDistanceLuceneWithCopy() {
156+
// add a copy to better reflect what Lucene has to do to get the target vector on-heap
157+
MemorySegment.copy(nativeSegB, LAYOUT_LE_FLOAT, 0L, scratch, 0, scratch.length);
158+
return VectorUtil.squareDistance(floatsA, scratch);
159+
}
160+
161+
@Benchmark
162+
@Fork(value = 3, jvmArgsPrepend = { "--add-modules=jdk.incubator.vector" })
163+
public float squareDistanceNativeWithNativeSeg() {
164+
return squareDistanceFloat32(nativeSegA, nativeSegB, size);
165+
}
166+
167+
@Benchmark
168+
@Fork(value = 3, jvmArgsPrepend = { "--add-modules=jdk.incubator.vector" })
169+
public float squareDistanceNativeWithHeapSeg() {
170+
return squareDistanceFloat32(heapSegA, heapSegB, size);
171+
}
172+
173+
static final VectorSimilarityFunctions vectorSimilarityFunctions = vectorSimilarityFunctions();
174+
175+
static VectorSimilarityFunctions vectorSimilarityFunctions() {
176+
return NativeAccess.instance().getVectorSimilarityFunctions().get();
177+
}
178+
179+
float cosineFloat32(MemorySegment a, MemorySegment b, int length) {
180+
try {
181+
return (float) vectorSimilarityFunctions.cosineHandleFloat32().invokeExact(a, b, length);
182+
} catch (Throwable e) {
183+
if (e instanceof Error err) {
184+
throw err;
185+
} else if (e instanceof RuntimeException re) {
186+
throw re;
187+
} else {
188+
throw new RuntimeException(e);
189+
}
190+
}
191+
}
192+
193+
float dotProductFloat32(MemorySegment a, MemorySegment b, int length) {
194+
try {
195+
return (float) vectorSimilarityFunctions.dotProductHandleFloat32().invokeExact(a, b, length);
196+
} catch (Throwable e) {
197+
if (e instanceof Error err) {
198+
throw err;
199+
} else if (e instanceof RuntimeException re) {
200+
throw re;
201+
} else {
202+
throw new RuntimeException(e);
203+
}
204+
}
205+
}
206+
207+
float squareDistanceFloat32(MemorySegment a, MemorySegment b, int length) {
208+
try {
209+
return (float) vectorSimilarityFunctions.squareDistanceHandleFloat32().invokeExact(a, b, length);
210+
} catch (Throwable e) {
211+
if (e instanceof Error err) {
212+
throw err;
213+
} else if (e instanceof RuntimeException re) {
214+
throw re;
215+
} else {
216+
throw new RuntimeException(e);
217+
}
218+
}
219+
}
220+
}
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the "Elastic License
4+
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
5+
* Public License v 1"; you may not use this file except in compliance with, at
6+
* your election, the "Elastic License 2.0", the "GNU Affero General Public
7+
* License v3.0 only", or the "Server Side Public License, v 1".
8+
*/
9+
10+
package org.elasticsearch.benchmark.vector;
11+
12+
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
13+
14+
import org.apache.lucene.util.Constants;
15+
import org.elasticsearch.test.ESTestCase;
16+
import org.junit.BeforeClass;
17+
import org.openjdk.jmh.annotations.Param;
18+
19+
import java.util.Arrays;
20+
21+
public class JDKVectorFloat32BenchmarkTests extends ESTestCase {
22+
23+
final double delta;
24+
final int size;
25+
26+
public JDKVectorFloat32BenchmarkTests(int size) {
27+
this.size = size;
28+
delta = 1e-3 * size;
29+
}
30+
31+
@BeforeClass
32+
public static void skipWindows() {
33+
assumeFalse("doesn't work on windows yet", Constants.WINDOWS);
34+
}
35+
36+
static boolean supportsHeapSegments() {
37+
return Runtime.version().feature() >= 22;
38+
}
39+
40+
public void testCosine() {
41+
for (int i = 0; i < 100; i++) {
42+
var bench = new JDKVectorFloat32Benchmark();
43+
bench.size = size;
44+
bench.init();
45+
try {
46+
float expected = cosineFloat32Scalar(bench.floatsA, bench.floatsB);
47+
assertEquals(expected, bench.cosineLucene(), delta);
48+
assertEquals(expected, bench.cosineLuceneWithCopy(), delta);
49+
assertEquals(expected, bench.cosineNativeWithNativeSeg(), delta);
50+
if (supportsHeapSegments()) {
51+
assertEquals(expected, bench.cosineNativeWithHeapSeg(), delta);
52+
}
53+
} finally {
54+
bench.teardown();
55+
}
56+
}
57+
}
58+
59+
public void testDotProduct() {
60+
for (int i = 0; i < 100; i++) {
61+
var bench = new JDKVectorFloat32Benchmark();
62+
bench.size = size;
63+
bench.init();
64+
try {
65+
float expected = dotProductFloat32Scalar(bench.floatsA, bench.floatsB);
66+
assertEquals(expected, bench.dotProductLucene(), delta);
67+
assertEquals(expected, bench.dotProductLuceneWithCopy(), delta);
68+
assertEquals(expected, bench.dotProductNativeWithNativeSeg(), delta);
69+
if (supportsHeapSegments()) {
70+
assertEquals(expected, bench.dotProductNativeWithHeapSeg(), delta);
71+
}
72+
} finally {
73+
bench.teardown();
74+
}
75+
}
76+
}
77+
78+
public void testSquareDistance() {
79+
for (int i = 0; i < 100; i++) {
80+
var bench = new JDKVectorFloat32Benchmark();
81+
bench.size = size;
82+
bench.init();
83+
try {
84+
float expected = squareDistanceFloat32Scalar(bench.floatsA, bench.floatsB);
85+
assertEquals(expected, bench.squareDistanceLucene(), delta);
86+
assertEquals(expected, bench.squareDistanceLuceneWithCopy(), delta);
87+
assertEquals(expected, bench.squareDistanceNativeWithNativeSeg(), delta);
88+
if (supportsHeapSegments()) {
89+
assertEquals(expected, bench.squareDistanceNativeWithHeapSeg(), delta);
90+
}
91+
} finally {
92+
bench.teardown();
93+
}
94+
}
95+
}
96+
97+
@ParametersFactory
98+
public static Iterable<Object[]> parametersFactory() {
99+
try {
100+
var params = JDKVectorFloat32Benchmark.class.getField("size").getAnnotationsByType(Param.class)[0].value();
101+
return () -> Arrays.stream(params).map(Integer::parseInt).map(i -> new Object[] { i }).iterator();
102+
} catch (NoSuchFieldException e) {
103+
throw new AssertionError(e);
104+
}
105+
}
106+
107+
/** Computes the cosine of the given vectors a and b. */
108+
static float cosineFloat32Scalar(float[] a, float[] b) {
109+
float dot = 0, normA = 0, normB = 0;
110+
for (int i = 0; i < a.length; i++) {
111+
dot += a[i] * b[i];
112+
normA += a[i] * a[i];
113+
normB += b[i] * b[i];
114+
}
115+
double normAA = Math.sqrt(normA);
116+
double normBB = Math.sqrt(normB);
117+
if (normAA == 0.0f || normBB == 0.0f) return 0.0f;
118+
return (float) (dot / (normAA * normBB));
119+
}
120+
121+
/** Computes the dot product of the given vectors a and b. */
122+
static float dotProductFloat32Scalar(float[] a, float[] b) {
123+
float res = 0;
124+
for (int i = 0; i < a.length; i++) {
125+
res += a[i] * b[i];
126+
}
127+
return res;
128+
}
129+
130+
/** Computes the dot product of the given vectors a and b. */
131+
static float squareDistanceFloat32Scalar(float[] a, float[] b) {
132+
float squareSum = 0;
133+
for (int i = 0; i < a.length; i++) {
134+
float diff = a[i] - b[i];
135+
squareSum += diff * diff;
136+
}
137+
return squareSum;
138+
}
139+
}

docs/changelog/130635.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 130635
2+
summary: "Add low-level optimized Neon, AVX2, and AVX 512 float32 vector operations"
3+
area: Vector Search
4+
type: enhancement
5+
issues: []

libs/native/libraries/build.gradle

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ configurations {
1919
}
2020

2121
var zstdVersion = "1.5.5"
22-
var vecVersion = "1.0.11"
22+
var vecVersion = "1.0.13"
2323

2424
repositories {
2525
exclusiveContent {

0 commit comments

Comments
 (0)