Skip to content

Commit

Permalink
the-qa-companyGH-528 improve binary search in super blocks by keeping…
Browse files Browse the repository at this point in the history
… track of the estimated location of values in the underlying long array. This assumes that the values are ordered.
  • Loading branch information
hmottestad committed Dec 11, 2024
1 parent f5bb5ce commit 5a45912
Show file tree
Hide file tree
Showing 9 changed files with 306 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
import com.the_qa_company.qendpoint.core.util.io.CloseSuppressPath;
import com.the_qa_company.qendpoint.core.util.io.Closer;
import com.the_qa_company.qendpoint.core.util.io.IOUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.Closeable;
import java.io.IOException;
Expand All @@ -39,14 +41,31 @@
* @author mario.arias
*/
public class Bitmap375Big extends Bitmap64Big {

private static final Logger logger = LoggerFactory.getLogger(Bitmap375Big.class);

/**
 * Selects the super block search strategy. It is {@code true} only when the
 * JVM system property {@code useOldBinarySearch} is set to {@code "true"}
 * (case-insensitive), e.g. {@code -DuseOldBinarySearch=true}; in that case
 * {@link #binarySearch(LongArray, long)} is used instead of
 * {@link #binarySearchNew(LongArray, long)}.
 */
private static final boolean oldBinarySearch = Boolean.getBoolean("useOldBinarySearch");

static {
    // Report once, at class initialisation, which strategy is active.
    if (oldBinarySearch) {
        logger.debug("Using old binary search");
    } else {
        logger.debug("Using new binary search");
    }
}

/**
 * Creates a disk-backed bitmap whose super index is kept in memory.
 * <p>
 * Convenience overload: forwards to the three-argument {@code disk}
 * overload with {@code false} as the last argument.
 *
 * @param location location of the bitmap on disk
 * @param nbits    number of bits
 * @return the created bitmap
 */
public static Bitmap375Big disk(Path location, long nbits) {
    return Bitmap375Big.disk(location, nbits, false);
}
Expand Down Expand Up @@ -181,6 +200,7 @@ public void updateIndex() {
}
pop = countSuperBlock + countBlock;
indexUpToDate = true;
superBlocks.recalculateEstimatedValueLocation();
}

/*
Expand All @@ -189,8 +209,9 @@ public void updateIndex() {
*/
@Override
public boolean access(long bitIndex) {
if (bitIndex < 0)
if (bitIndex < 0) {
throw new IndexOutOfBoundsException("bitIndex < 0: " + bitIndex);
}

long wordIndex = wordIndex(bitIndex);
if (wordIndex >= words.length()) {
Expand Down Expand Up @@ -324,15 +345,14 @@ public long select1(long x) {
return 0;
}
// Search superblock (binary Search)
long superBlockIndex = binarySearch(superBlocks, x);
long superBlockIndex = oldBinarySearch ? binarySearch(superBlocks, x) : binarySearchNew(superBlocks, x);

// If there is a run of many zeros, two correlative superblocks may have
// the same value,
// We need to position at the first of them.

while (superBlockIndex > 0 && (superBlocks.get(superBlockIndex) >= x)) {
superBlockIndex--;

}

long countdown = x - superBlocks.get(superBlockIndex);
Expand Down Expand Up @@ -444,6 +464,7 @@ public static long binarySearch0(LongArray arr, long fromIndex, long toIndex, lo
* @param val val
* @return index
*/

public static long binarySearch(LongArray arr, long val) {
long min = 0, max = arr.length(), mid;

Expand All @@ -460,11 +481,52 @@ public static long binarySearch(LongArray arr, long val) {
return min;
}

/**
 * Variant of {@link #binarySearch(LongArray, long)} that starts from the
 * window and first probe suggested by the array's estimated-location hints
 * instead of the whole array. It assumes the values in {@code arr} are
 * ordered.
 * <p>
 * Probing order: the first probe is the hinted location, the second and
 * third probes step linearly up from the lower end of the window, and every
 * later probe bisects the window. Once the window is at most one element
 * wide, the result is reported back to the array through
 * {@link LongArray#updateEstimatedValueLocation(long, long)}.
 *
 * @param arr array to search
 * @param val value to look for
 * @return the lower end of the narrowed window
 */
public static long binarySearchNew(LongArray arr, long val) {
    long lo = arr.getEstimatedLocationLowerBound(val);
    long hi = arr.getEstimatedLocationUpperBound(val);
    long probe = arr.getEstimatedLocation(val, lo, hi);

    for (int step = 0; lo + 1 < hi; step++) {
        // After the first probe the target is typically very close to the
        // lower end, so two linear steps usually beat bisecting.
        if (step == 1 || step == 2) {
            probe = lo + 1;
        }

        if (arr.get(probe) >= val) {
            hi = probe;
        } else {
            lo = probe;
        }
        probe = (lo + hi) / 2;
    }

    arr.updateEstimatedValueLocation(val, lo);
    return lo;
}

/**
 * @return the value of the {@code blocksPath} field (presumably the file
 *         backing the blocks index; confirm against the constructor)
 */
public CloseSuppressPath getBlocksPath() {
    return this.blocksPath;
}

/**
 * @return the value of the {@code superBlocksPath} field (presumably the
 *         file backing the super blocks index; confirm against the
 *         constructor)
 */
public CloseSuppressPath getSuperBlocksPath() {
    return this.superBlocksPath;
}

/**
 * @return the constant string {@code "Bitmap375Big{}"}; no instance state
 *         is included
 */
@Override
public String toString() {
return "Bitmap375Big{}";
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
package com.the_qa_company.qendpoint.core.util.disk;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Base class for {@link LongArray} implementations that maintain
 * estimated-location tables used to narrow down searches over ordered
 * values.
 * <p>
 * The value range {@code [0, maxValue]} is divided into buckets of
 * {@link #getEstimatedLocationArrayBucketSize()} values. A value {@code v}
 * maps to slot {@code v / bucketSize + 1}; for each slot the tables record
 * the lowest, highest and middle array index at which a value of that
 * bucket was seen. A slot containing {@code 0} means "no information".
 * <p>
 * Subclasses store the data through {@link #innerSet(long, long)};
 * {@link #set(long, long)} is final so that the largest value written is
 * always tracked.
 */
public abstract class AbstractLongArray implements LongArray {

    /** Number of slots in each estimated-location table. */
    private static final int ESTIMATED_LOCATION_ARRAY_SIZE = 1024 * 128;

    /**
     * Largest bucket size that can be produced; keeps the shift below from
     * overflowing the {@code int} returned by
     * {@link #getEstimatedLocationArrayBucketSize()}.
     */
    private static final int MAX_BUCKET_SIZE = 1 << 30;

    private final Logger logger = LoggerFactory.getLogger(getClass());

    // each table takes about 1MB when ESTIMATED_LOCATION_ARRAY_SIZE is
    // 1024 * 128 (128K longs)
    private final long[] estimatedLocationMax = new long[ESTIMATED_LOCATION_ARRAY_SIZE];
    private final long[] estimatedLocationMin = new long[ESTIMATED_LOCATION_ARRAY_SIZE];
    private final long[] estimatedLocation = new long[ESTIMATED_LOCATION_ARRAY_SIZE];

    // Starts at 1 (not 0) so that lookups performed before the first
    // recalculation do not divide by zero.
    private int estimatedLocationBucketSize = 1;

    /** Largest value ever passed to {@link #set(long, long)} (at least 1). */
    long maxValue = 1;

    /**
     * @return the number of consecutive values that share one slot of the
     *         estimated-location tables
     */
    @Override
    public int getEstimatedLocationArrayBucketSize() {
        return estimatedLocationBucketSize;
    }

    /**
     * Picks the smallest power-of-two bucket size for which the slot of
     * {@code maxValue} ({@code maxValue / bucketSize + 1}) is still a valid
     * index of the tables, capped at {@link #MAX_BUCKET_SIZE}.
     */
    private void updateEstimatedLocationArrayBucketSize() {
        long bucketSize = 1;
        // "maxValue / bucketSize + 1 < SIZE" written without the "+ 1" so
        // that it cannot overflow for very large maxValue
        while (bucketSize < MAX_BUCKET_SIZE && maxValue / bucketSize >= ESTIMATED_LOCATION_ARRAY_SIZE - 1) {
            bucketSize <<= 1;
        }
        this.estimatedLocationBucketSize = (int) bucketSize;
    }

    /**
     * @return the table holding, per slot, the estimated array index of the
     *         values of that bucket
     */
    @Override
    public long[] getEstimatedLocationArray() {
        return estimatedLocation;
    }

    /**
     * @return the table holding, per slot, the lowest array index at which a
     *         value of that bucket was seen
     */
    @Override
    public long[] getEstimatedLocationArrayMin() {
        return estimatedLocationMin;
    }

    /**
     * @return the table holding, per slot, the highest array index at which
     *         a value of that bucket was seen
     */
    @Override
    public long[] getEstimatedLocationArrayMax() {
        return estimatedLocationMax;
    }

    /**
     * Rebuilds the estimated-location tables with a full scan of the array.
     * <p>
     * The bucket size is recomputed from {@code maxValue} first and the
     * tables are cleared, because slots computed with a previous bucket size
     * (or for values that have since been overwritten) would otherwise give
     * wrong hints. Elements equal to {@code 0} are ignored, as are values
     * whose slot falls outside the tables. Progress is logged for arrays
     * longer than 2M entries.
     */
    @Override
    public void recalculateEstimatedValueLocation() {
        updateEstimatedLocationArrayBucketSize();
        final int bucketSize = getEstimatedLocationArrayBucketSize();

        java.util.Arrays.fill(estimatedLocationMax, 0L);
        java.util.Arrays.fill(estimatedLocationMin, 0L);
        java.util.Arrays.fill(estimatedLocation, 0L);

        final long len = length();
        final boolean shouldLog = len > 1024 * 1024 * 2;
        if (shouldLog) {
            logger.info("Recalculating estimated location array 0%");
        }

        // long counter: the array may hold more than Integer.MAX_VALUE entries
        for (long i = 0; i < len; i++) {
            long val = get(i);
            if (val == 0) {
                continue;
            }

            long slot = val / bucketSize + 1;
            if (slot < 0 || slot >= ESTIMATED_LOCATION_ARRAY_SIZE) {
                // negative value, or value beyond what the tables can cover
                continue;
            }

            int index = (int) slot;
            estimatedLocationMax[index] = Math.max(estimatedLocationMax[index], i);
            if (estimatedLocationMin[index] == 0) {
                estimatedLocationMin[index] = i;
            } else {
                estimatedLocationMin[index] = Math.min(estimatedLocationMin[index], i);
            }
            estimatedLocation[index] = (estimatedLocationMax[index] + estimatedLocationMin[index]) / 2;

            if (shouldLog && i % (1024 * 1024) == 0) {
                logger.info("Recalculating estimated location array {}%", (int) Math.floor(100.0 / len * i));
            }
        }

        if (shouldLog) {
            logger.info("Recalculating estimated location array 100%");
        }
    }

    /**
     * Stores {@code value} at {@code index} and remembers the largest value
     * written so far, which drives the bucket size of the tables.
     *
     * @param index index to write
     * @param value value to store
     */
    @Override
    public final void set(long index, long value) {
        maxValue = Math.max(maxValue, value);
        innerSet(index, value);
    }

    /**
     * Performs the actual write for {@link #set(long, long)}.
     *
     * @param index index to write
     * @param value value to store
     */
    abstract protected void innerSet(long index, long value);

}
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
*
* @author Antoine Willerval
*/
public class LargeLongArray implements LongArray {
public class LargeLongArray extends AbstractLongArray {
private UnsafeLongArray array;

/**
Expand All @@ -26,7 +26,7 @@ public long get(long index) {
}

@Override
public void set(long index, long value) {
protected void innerSet(long index, long value) {
array.set(index, value);
}

Expand Down Expand Up @@ -55,4 +55,5 @@ public void resize(long newSize) throws IOException {
public void clear() {
array.clear();
}

}
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package com.the_qa_company.qendpoint.core.util.disk;

import com.the_qa_company.qendpoint.core.util.io.IOUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Iterator;
Expand All @@ -10,6 +12,10 @@
* Describe a large array of longs
*/
public interface LongArray extends Iterable<Long> {

/** Logger used by the default methods of this interface. */
Logger logger = LoggerFactory.getLogger(LongArray.class);
/** Shared zero-length array returned by the default {@link #getEstimatedLocationArray()}. */
long[] EMPTY_ARRAY = new long[0];

/**
* create an in memory long array
*
Expand Down Expand Up @@ -208,4 +214,84 @@ public Long next() {
}
};
}

/**
 * Table holding, per bucket of values, the highest known location.
 * <p>
 * By default this is the same array as
 * {@link #getEstimatedLocationArray()}.
 *
 * @return the estimated location array that contains the highest location
 *         for a given value
 */
default long[] getEstimatedLocationArrayMax() {
    return this.getEstimatedLocationArray();
}

/**
 * Table holding, per bucket of values, the lowest known location.
 * <p>
 * By default this is the same array as
 * {@link #getEstimatedLocationArray()}.
 *
 * @return the estimated location array that contains the lowest location
 *         for a given value
 */
default long[] getEstimatedLocationArrayMin() {
    return this.getEstimatedLocationArray();
}

/**
 * Table holding, per bucket of values, the estimated location.
 * <p>
 * The default is a shared zero-length array, i.e. no estimates.
 *
 * @return the estimated location array
 */
default long[] getEstimatedLocationArray() {
    return LongArray.EMPTY_ARRAY;
}

/**
 * @return the number of consecutive values that map to the same slot of
 *         the estimated location arrays; 65536 by default
 */
default int getEstimatedLocationArrayBucketSize() {
    return 1 << 16;
}

/**
 * Returns a starting lower bound for a search of {@code val}.
 * <p>
 * The slot of {@code val} is {@code val / bucketSize + 1}; this method
 * reads the slot just before it in
 * {@link #getEstimatedLocationArrayMax()}. When that slot does not exist
 * (including when no table is available at all) or holds no information
 * ({@code <= 0}), the bound is {@code 0}.
 *
 * @param val value that is going to be searched
 * @return the recorded location for the preceding slot, or {@code 0}
 */
default long getEstimatedLocationLowerBound(long val) {
    long[] maxLocations = getEstimatedLocationArrayMax();
    // slot preceding the one of val: (val / bucketSize + 1) - 1
    long previousSlot = val / getEstimatedLocationArrayBucketSize();
    // range check in long arithmetic, before narrowing to an int index
    if (previousSlot >= 0 && previousSlot < maxLocations.length) {
        long t = maxLocations[(int) previousSlot];
        if (t > 0) {
            return t;
        }
    }
    return 0;
}

/**
 * Returns a starting upper bound for a search of {@code val}.
 * <p>
 * The slot of {@code val} is {@code val / bucketSize + 1}; this method
 * reads the slot just after it in
 * {@link #getEstimatedLocationArrayMin()}. When that slot does not exist
 * or holds no information ({@code <= 0}), the bound is {@link #length()}.
 *
 * @param val value that is going to be searched
 * @return the recorded location for the following slot, capped at
 *         {@link #length()}, or {@link #length()} itself
 */
default long getEstimatedLocationUpperBound(long val) {
    long[] minLocations = getEstimatedLocationArrayMin();
    // slot following the one of val: (val / bucketSize + 1) + 1
    long nextSlot = val / getEstimatedLocationArrayBucketSize() + 2;
    // range check in long arithmetic, before narrowing to an int index
    if (nextSlot >= 0 && nextSlot < minLocations.length) {
        long t = minLocations[(int) nextSlot];
        if (t > 0) {
            return Math.min(length(), t);
        }
    }

    return length();
}

/**
 * Returns the first index to probe when searching {@code val} in the open
 * window {@code (min, max)}.
 * <p>
 * The estimate stored for the slot of {@code val}
 * ({@code val / bucketSize + 1}) is used when it lies strictly inside the
 * window; otherwise, or when the slot is outside the table, the midpoint
 * of the window is returned.
 *
 * @param val value that is going to be searched
 * @param min lower end of the search window
 * @param max upper end of the search window
 * @return the estimated location, or {@code (min + max) / 2}
 */
default long getEstimatedLocation(long val, long min, long max) {
    long midpoint = (min + max) / 2;
    long[] estimatedLocation = getEstimatedLocationArray();

    long slot = val / getEstimatedLocationArrayBucketSize() + 1;
    // range check in long arithmetic, before narrowing to an int index
    if (slot < 0 || slot >= estimatedLocation.length) {
        return midpoint;
    }

    long t = estimatedLocation[(int) slot];
    if (t > min && t < max) {
        return t;
    }
    return midpoint;
}

/**
 * Rebuilds the estimated location arrays. The default implementation
 * builds nothing; it only logs that the implementing class has no support
 * for it.
 */
default void recalculateEstimatedValueLocation() {
    String implementation = this.getClass().getCanonicalName();
    logger.info("Class {} does not support recalculateEstimatedValueLocation()", implementation);
}

/**
 * Records {@code min} as the estimated location for the slot of
 * {@code val} ({@code val / bucketSize + 1}) in
 * {@link #getEstimatedLocationArray()}. Nothing is recorded when the slot
 * is outside the table.
 *
 * @param val value that was searched
 * @param min location found for that value
 */
default void updateEstimatedValueLocation(long val, long min) {
    long[] estimatedLocation = getEstimatedLocationArray();
    long slot = val / getEstimatedLocationArrayBucketSize() + 1;
    // range check in long arithmetic, before narrowing to an int index
    if (slot < 0 || slot >= estimatedLocation.length) {
        return;
    }
    estimatedLocation[(int) slot] = min;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@

//Implementing an array of longs that is backed up on disk. Following this: http://vanillajava.blogspot.fr/2011/12/using-memory-mapped-file-for-huge.html

public class LongArrayDisk implements Closeable, LongArray {
public class LongArrayDisk extends AbstractLongArray implements Closeable {
private static final long MAPPING_SIZE = 1 << 30;
private final boolean closeChannel;
private final FileChannel channel;
Expand Down Expand Up @@ -137,7 +137,7 @@ public long get(long x) {
}

@Override
public void set(long index, long value) {
protected void innerSet(long index, long value) {
if (index >= size || index < 0) {
throw new IndexOutOfBoundsException();
}
Expand Down
Loading

0 comments on commit 5a45912

Please sign in to comment.