Skip to content

Commit

Permalink
Revert "Ensure Panama float vector distance impls inlinable (#14031)" (#14041)
Browse files Browse the repository at this point in the history

This reverts commit 4f08f3d.
  • Loading branch information
rmuir authored Dec 4, 2024
1 parent 6c48b40 commit c1362cc
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 50 deletions.
3 changes: 0 additions & 3 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -130,9 +130,6 @@ Optimizations
* GITHUB#14032: Speed up PostingsEnum when positions are requested.
(Adrien Grand)

* GITHUB#14031: Ensure Panama float vector distance impls inlinable.
(Robert Muir, Chris Hegarty)

Bug Fixes
---------------------
* GITHUB#13832: Fixed an issue where the DefaultPassageFormatter.format method did not format passages as intended
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,6 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport {
}
}

// cached vector sizes for smaller method bodies
private static final int FLOAT_SPECIES_LENGTH = FLOAT_SPECIES.length();

// the way FMA should work! if available use it, otherwise fall back to mul/add
private static FloatVector fma(FloatVector a, FloatVector b, FloatVector c) {
if (Constants.HAS_FAST_VECTOR_FMA) {
Expand All @@ -102,7 +99,7 @@ public float dotProduct(float[] a, float[] b) {
float res = 0;

// if the array size is large (> 2x platform vector size), it's worth the overhead to vectorize
if (a.length > 2 * FLOAT_SPECIES_LENGTH) {
if (a.length > 2 * FLOAT_SPECIES.length()) {
i += FLOAT_SPECIES.loopBound(a.length);
res += dotProductBody(a, b, i);
}
Expand All @@ -123,33 +120,30 @@ private float dotProductBody(float[] a, float[] b, int limit) {
FloatVector acc2 = FloatVector.zero(FLOAT_SPECIES);
FloatVector acc3 = FloatVector.zero(FLOAT_SPECIES);
FloatVector acc4 = FloatVector.zero(FLOAT_SPECIES);
final int unrolledLimit = limit - 3 * FLOAT_SPECIES_LENGTH;
for (; i < unrolledLimit; i += 4 * FLOAT_SPECIES_LENGTH) {
int unrolledLimit = limit - 3 * FLOAT_SPECIES.length();
for (; i < unrolledLimit; i += 4 * FLOAT_SPECIES.length()) {
// one
FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i);
FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i);
acc1 = fma(va, vb, acc1);

// two
final int i2 = i + FLOAT_SPECIES_LENGTH;
FloatVector vc = FloatVector.fromArray(FLOAT_SPECIES, a, i2);
FloatVector vd = FloatVector.fromArray(FLOAT_SPECIES, b, i2);
FloatVector vc = FloatVector.fromArray(FLOAT_SPECIES, a, i + FLOAT_SPECIES.length());
FloatVector vd = FloatVector.fromArray(FLOAT_SPECIES, b, i + FLOAT_SPECIES.length());
acc2 = fma(vc, vd, acc2);

// three
final int i3 = i2 + FLOAT_SPECIES_LENGTH;
FloatVector ve = FloatVector.fromArray(FLOAT_SPECIES, a, i3);
FloatVector vf = FloatVector.fromArray(FLOAT_SPECIES, b, i3);
FloatVector ve = FloatVector.fromArray(FLOAT_SPECIES, a, i + 2 * FLOAT_SPECIES.length());
FloatVector vf = FloatVector.fromArray(FLOAT_SPECIES, b, i + 2 * FLOAT_SPECIES.length());
acc3 = fma(ve, vf, acc3);

// four
final int i4 = i3 + FLOAT_SPECIES_LENGTH;
FloatVector vg = FloatVector.fromArray(FLOAT_SPECIES, a, i4);
FloatVector vh = FloatVector.fromArray(FLOAT_SPECIES, b, i4);
FloatVector vg = FloatVector.fromArray(FLOAT_SPECIES, a, i + 3 * FLOAT_SPECIES.length());
FloatVector vh = FloatVector.fromArray(FLOAT_SPECIES, b, i + 3 * FLOAT_SPECIES.length());
acc4 = fma(vg, vh, acc4);
}
// vector tail: fewer scalar computations for unaligned sizes, especially with big vector sizes
for (; i < limit; i += FLOAT_SPECIES_LENGTH) {
for (; i < limit; i += FLOAT_SPECIES.length()) {
FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i);
FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i);
acc1 = fma(va, vb, acc1);
Expand All @@ -168,7 +162,7 @@ public float cosine(float[] a, float[] b) {
float norm2 = 0;

// if the array size is large (> 2x platform vector size), it's worth the overhead to vectorize
if (a.length > 2 * FLOAT_SPECIES_LENGTH) {
if (a.length > 2 * FLOAT_SPECIES.length()) {
i += FLOAT_SPECIES.loopBound(a.length);
float[] ret = cosineBody(a, b, i);
sum += ret[0];
Expand Down Expand Up @@ -196,8 +190,8 @@ private float[] cosineBody(float[] a, float[] b, int limit) {
FloatVector norm1_2 = FloatVector.zero(FLOAT_SPECIES);
FloatVector norm2_1 = FloatVector.zero(FLOAT_SPECIES);
FloatVector norm2_2 = FloatVector.zero(FLOAT_SPECIES);
final int unrolledLimit = limit - FLOAT_SPECIES_LENGTH;
for (; i < unrolledLimit; i += 2 * FLOAT_SPECIES_LENGTH) {
int unrolledLimit = limit - FLOAT_SPECIES.length();
for (; i < unrolledLimit; i += 2 * FLOAT_SPECIES.length()) {
// one
FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i);
FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i);
Expand All @@ -206,15 +200,14 @@ private float[] cosineBody(float[] a, float[] b, int limit) {
norm2_1 = fma(vb, vb, norm2_1);

// two
final int i2 = i + FLOAT_SPECIES_LENGTH;
FloatVector vc = FloatVector.fromArray(FLOAT_SPECIES, a, i2);
FloatVector vd = FloatVector.fromArray(FLOAT_SPECIES, b, i2);
FloatVector vc = FloatVector.fromArray(FLOAT_SPECIES, a, i + FLOAT_SPECIES.length());
FloatVector vd = FloatVector.fromArray(FLOAT_SPECIES, b, i + FLOAT_SPECIES.length());
sum2 = fma(vc, vd, sum2);
norm1_2 = fma(vc, vc, norm1_2);
norm2_2 = fma(vd, vd, norm2_2);
}
// vector tail: fewer scalar computations for unaligned sizes, especially with big vector sizes
for (; i < limit; i += FLOAT_SPECIES_LENGTH) {
for (; i < limit; i += FLOAT_SPECIES.length()) {
FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i);
FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i);
sum1 = fma(va, vb, sum1);
Expand All @@ -234,7 +227,7 @@ public float squareDistance(float[] a, float[] b) {
float res = 0;

// if the array size is large (> 2x platform vector size), it's worth the overhead to vectorize
if (a.length > 2 * FLOAT_SPECIES_LENGTH) {
if (a.length > 2 * FLOAT_SPECIES.length()) {
i += FLOAT_SPECIES.loopBound(a.length);
res += squareDistanceBody(a, b, i);
}
Expand All @@ -247,12 +240,6 @@ public float squareDistance(float[] a, float[] b) {
return res;
}

/**
 * Helper: returns {@code fma(a.sub(b), a.sub(b), c)}.
 *
 * <p>The lane-wise difference {@code a.sub(b)} is computed once, then passed twice to this
 * class's {@code fma} helper together with the accumulator {@code c}.
 *
 * @param a first input vector
 * @param b second input vector, subtracted lane-wise from {@code a}
 * @param c accumulator passed as the third argument to {@code fma}
 * @return the result of {@code fma(diff, diff, c)} where {@code diff = a.sub(b)}
 */
private static FloatVector square(FloatVector a, FloatVector b, FloatVector c) {
// compute the difference once so it can be reused for both fma operands
FloatVector diff = a.sub(b);
return fma(diff, diff, c);
}

/** vectorized square distance body */
private float squareDistanceBody(float[] a, float[] b, int limit) {
int i = 0;
Expand All @@ -262,36 +249,38 @@ private float squareDistanceBody(float[] a, float[] b, int limit) {
FloatVector acc2 = FloatVector.zero(FLOAT_SPECIES);
FloatVector acc3 = FloatVector.zero(FLOAT_SPECIES);
FloatVector acc4 = FloatVector.zero(FLOAT_SPECIES);
final int unrolledLimit = limit - 3 * FLOAT_SPECIES_LENGTH;
for (; i < unrolledLimit; i += 4 * FLOAT_SPECIES_LENGTH) {
int unrolledLimit = limit - 3 * FLOAT_SPECIES.length();
for (; i < unrolledLimit; i += 4 * FLOAT_SPECIES.length()) {
// one
FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i);
FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i);
acc1 = square(va, vb, acc1);
FloatVector diff1 = va.sub(vb);
acc1 = fma(diff1, diff1, acc1);

// two
final int i2 = i + FLOAT_SPECIES_LENGTH;
FloatVector vc = FloatVector.fromArray(FLOAT_SPECIES, a, i2);
FloatVector vd = FloatVector.fromArray(FLOAT_SPECIES, b, i2);
acc2 = square(vc, vd, acc2);
FloatVector vc = FloatVector.fromArray(FLOAT_SPECIES, a, i + FLOAT_SPECIES.length());
FloatVector vd = FloatVector.fromArray(FLOAT_SPECIES, b, i + FLOAT_SPECIES.length());
FloatVector diff2 = vc.sub(vd);
acc2 = fma(diff2, diff2, acc2);

// three
final int i3 = i2 + FLOAT_SPECIES_LENGTH;
FloatVector ve = FloatVector.fromArray(FLOAT_SPECIES, a, i3);
FloatVector vf = FloatVector.fromArray(FLOAT_SPECIES, b, i3);
acc3 = square(ve, vf, acc3);
FloatVector ve = FloatVector.fromArray(FLOAT_SPECIES, a, i + 2 * FLOAT_SPECIES.length());
FloatVector vf = FloatVector.fromArray(FLOAT_SPECIES, b, i + 2 * FLOAT_SPECIES.length());
FloatVector diff3 = ve.sub(vf);
acc3 = fma(diff3, diff3, acc3);

// four
final int i4 = i3 + FLOAT_SPECIES_LENGTH;
FloatVector vg = FloatVector.fromArray(FLOAT_SPECIES, a, i4);
FloatVector vh = FloatVector.fromArray(FLOAT_SPECIES, b, i4);
acc4 = square(vg, vh, acc4);
FloatVector vg = FloatVector.fromArray(FLOAT_SPECIES, a, i + 3 * FLOAT_SPECIES.length());
FloatVector vh = FloatVector.fromArray(FLOAT_SPECIES, b, i + 3 * FLOAT_SPECIES.length());
FloatVector diff4 = vg.sub(vh);
acc4 = fma(diff4, diff4, acc4);
}
// vector tail: fewer scalar computations for unaligned sizes, especially with big vector sizes
for (; i < limit; i += FLOAT_SPECIES_LENGTH) {
for (; i < limit; i += FLOAT_SPECIES.length()) {
FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i);
FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i);
acc1 = square(va, vb, acc1);
FloatVector diff = va.sub(vb);
acc1 = fma(diff, diff, acc1);
}
// reduce
FloatVector res1 = acc1.add(acc2);
Expand Down

0 comments on commit c1362cc

Please sign in to comment.