From 47e28a8ff9de7d5348433d68a8b167beddbaa81f Mon Sep 17 00:00:00 2001
From: Alexander Kuzmenkov <36882414+akuzm@users.noreply.github.com>
Date: Mon, 29 Jan 2024 19:25:30 +0100
Subject: [PATCH] Vectorize boolean operators

Implement vectorized computation of AND, OR and NOT operators in WHERE
clause.
---
 .../nodes/decompress_chunk/compressed_batch.c | 427 +++++++++++-------
 .../nodes/decompress_chunk/compressed_batch.h |   2 +-
 tsl/src/nodes/decompress_chunk/planner.c      |  50 +-
 .../decompress_chunk/pred_vector_array.c      |  64 +--
 .../decompress_chunk/vector_predicates.h      |  40 ++
 tsl/test/expected/decompress_vector_qual.out  | 277 +++++++++++-
 tsl/test/sql/decompress_vector_qual.sql       |  75 ++-
 7 files changed, 707 insertions(+), 228 deletions(-)

diff --git a/tsl/src/nodes/decompress_chunk/compressed_batch.c b/tsl/src/nodes/decompress_chunk/compressed_batch.c
index 69315097d9a..061388fbde7 100644
--- a/tsl/src/nodes/decompress_chunk/compressed_batch.c
+++ b/tsl/src/nodes/decompress_chunk/compressed_batch.c
@@ -175,225 +175,305 @@ decompress_column(DecompressContext *dcontext, DecompressBatchState *batch_state
 	}
 }
 
-/*
- * Compute the vectorized filters. Returns true if we have any passing rows. If not,
- * it means the entire batch is filtered out, and we use this for further
- * optimizations.
- */
-static bool
-compute_vector_quals(DecompressContext *dcontext, DecompressBatchState *batch_state)
+static void
+compute_plain_qual(DecompressContext *dcontext, DecompressBatchState *batch_state, Node *qual,
+				   uint64 *restrict result)
 {
-	if (!dcontext->vectorized_quals_constified)
+	/*
+	 * For now, we support NullTest, "Var ? Const" predicates and
+	 * ScalarArrayOperations.
+	 */
+	List *args = NULL;
+	RegProcedure vector_const_opcode = InvalidOid;
+	ScalarArrayOpExpr *saop = NULL;
+	OpExpr *opexpr = NULL;
+	NullTest *nulltest = NULL;
+	if (IsA(qual, NullTest))
 	{
-		return true;
+		nulltest = castNode(NullTest, qual);
+		args = list_make1(nulltest->arg);
+	}
+	else if (IsA(qual, ScalarArrayOpExpr))
+	{
+		saop = castNode(ScalarArrayOpExpr, qual);
+		args = saop->args;
+		vector_const_opcode = get_opcode(saop->opno);
+	}
+	else
+	{
+		opexpr = castNode(OpExpr, qual);
+		args = opexpr->args;
+		vector_const_opcode = get_opcode(opexpr->opno);
 	}
 
 	/*
-	 * Allocate the bitmap that will hold the vectorized qual results. We will
-	 * initialize it to all ones and AND the individual quals to it.
+	 * Find the compressed column referred to by the Var.
 	 */
-	const int bitmap_bytes = sizeof(uint64) * (((uint64) batch_state->total_batch_rows + 63) / 64);
-	batch_state->vector_qual_result = palloc(bitmap_bytes);
-	memset(batch_state->vector_qual_result, 0xFF, bitmap_bytes);
-	if (batch_state->total_batch_rows % 64 != 0)
+	Var *var = castNode(Var, linitial(args));
+	CompressionColumnDescription *column_description = NULL;
+	int column_index = 0;
+	for (; column_index < dcontext->num_total_columns; column_index++)
+	{
+		column_description = &dcontext->template_columns[column_index];
+		if (column_description->output_attno == var->varattno)
+		{
+			break;
+		}
+	}
+	Ensure(column_index < dcontext->num_total_columns,
+		   "decompressed column %d not found in batch",
+		   var->varattno);
+	Assert(column_description != NULL);
+	Assert(column_description->typid == var->vartype);
+	Ensure(column_description->type == COMPRESSED_COLUMN,
+		   "only compressed columns are supported in vectorized quals");
+	Assert(column_index < dcontext->num_compressed_columns);
+
+	CompressedColumnValues *column_values = &batch_state->compressed_columns[column_index];
+
+	if (column_values->decompression_type == DT_Invalid)
 	{
 		/*
-		 * We have to zero out the bits for past-the-end elements in the last
-		 * bitmap word. Since all predicates are ANDed to the result bitmap,
-		 * we can do it here once instead of doing it in each predicate.
+		 * We decompress the compressed columns on demand, so that we can
+		 * skip decompressing some columns if the entire batch doesn't pass
+		 * the quals.
 		 */
-		const uint64 mask = ((uint64) -1) >> (64 - batch_state->total_batch_rows % 64);
-		batch_state->vector_qual_result[batch_state->total_batch_rows / 64] = mask;
+		decompress_column(dcontext, batch_state, column_index);
+		Assert(column_values->decompression_type != DT_Invalid);
 	}
 
+	Assert(column_values->decompression_type != DT_Iterator);
+
 	/*
-	 * Compute the quals.
+	 * Prepare to compute the vector predicate. We have to handle the
+	 * default values in a special way because they don't produce the usual
+	 * decompressed ArrowArrays.
 	 */
-	ListCell *lc;
-	foreach (lc, dcontext->vectorized_quals_constified)
+	uint64 default_value_predicate_result;
+	uint64 *predicate_result = result;
+	const ArrowArray *vector = column_values->arrow;
+	if (column_values->arrow == NULL)
 	{
 		/*
-		 * For now, we support NullTest, "Var ? Const" predicates and
-		 * ScalarArrayOperations.
+		 * The compressed column had a default value. We can't fall back to
+		 * the non-vectorized quals now, so build a single-value ArrowArray
+		 * with this default value, check if it passes the predicate, and apply
+		 * it to the entire batch.
 		 */
-		List *args = NULL;
-		RegProcedure vector_const_opcode = InvalidOid;
-		ScalarArrayOpExpr *saop = NULL;
-		OpExpr *opexpr = NULL;
-		NullTest *nulltest = NULL;
-		if (IsA(lfirst(lc), NullTest))
-		{
-			nulltest = castNode(NullTest, lfirst(lc));
-			args = list_make1(nulltest->arg);
-		}
-		else if (IsA(lfirst(lc), ScalarArrayOpExpr))
+		Assert(column_values->decompression_type == DT_Default);
+
+		/*
+		 * We saved the actual default value into the decompressed scan slot
+		 * above, so pull it from there.
+		 */
+		vector = make_single_value_arrow(column_description->typid,
+										 *column_values->output_value,
+										 *column_values->output_isnull);
+
+		/*
+		 * We start from an all-valid bitmap, because the predicate is
+		 * AND-ed to it.
+		 */
+		default_value_predicate_result = 1;
+		predicate_result = &default_value_predicate_result;
+	}
+
+	if (nulltest)
+	{
+		vector_nulltest(vector, nulltest->nulltesttype, predicate_result);
+	}
+	else
+	{
+		/*
+		 * Find the vector_const predicate.
+		 */
+		VectorPredicate *vector_const_predicate = get_vector_const_predicate(vector_const_opcode);
+		Assert(vector_const_predicate != NULL);
+
+		Ensure(IsA(lsecond(args), Const),
+			   "failed to evaluate runtime constant in vectorized filter");
+
+		/*
+		 * The vectorizable predicates should be STRICT, so we shouldn't see null
+		 * constants here.
+		 */
+		Const *constnode = castNode(Const, lsecond(args));
+		Ensure(!constnode->constisnull, "vectorized predicate called for a null value");
+
+		/*
+		 * At last, compute the predicate.
+		 */
+		if (saop)
 		{
-			saop = castNode(ScalarArrayOpExpr, lfirst(lc));
-			args = saop->args;
-			vector_const_opcode = get_opcode(saop->opno);
+			vector_array_predicate(vector_const_predicate,
+								   saop->useOr,
+								   vector,
+								   constnode->constvalue,
+								   predicate_result);
 		}
 		else
 		{
-			opexpr = castNode(OpExpr, lfirst(lc));
-			args = opexpr->args;
-			vector_const_opcode = get_opcode(opexpr->opno);
+			vector_const_predicate(vector, constnode->constvalue, predicate_result);
 		}
 
 		/*
-		 * Find the compressed column referred to by the Var.
+		 * Account for nulls which shouldn't pass the predicate. Note that the
+		 * vector here might have only one row, in contrast with the number of
+		 * rows in the batch, if the column has a default value in this batch.
 		 */
-		Var *var = castNode(Var, linitial(args));
-		CompressionColumnDescription *column_description = NULL;
-		int column_index = 0;
-		for (; column_index < dcontext->num_total_columns; column_index++)
+		const size_t n_vector_result_words = (vector->length + 63) / 64;
+		const uint64 *restrict validity = (uint64 *restrict) vector->buffers[0];
+		for (size_t i = 0; i < n_vector_result_words; i++)
 		{
-			column_description = &dcontext->template_columns[column_index];
-			if (column_description->output_attno == var->varattno)
-			{
-				break;
-			}
+			predicate_result[i] &= validity[i];
 		}
-		Ensure(column_index < dcontext->num_total_columns,
-			   "decompressed column %d not found in batch",
-			   var->varattno);
-		Assert(column_description != NULL);
-		Assert(column_description->typid == var->vartype);
-		Ensure(column_description->type == COMPRESSED_COLUMN,
-			   "only compressed columns are supported in vectorized quals");
-		Assert(column_index < dcontext->num_compressed_columns);
-
-		CompressedColumnValues *column_values = &batch_state->compressed_columns[column_index];
-
-		if (column_values->decompression_type == DT_Invalid)
+	}
+
+	/* Translate the result if the column had a default value. */
+	if (column_values->arrow == NULL)
+	{
+		Assert(column_values->decompression_type == DT_Default);
+		if (!(default_value_predicate_result & 1))
 		{
 			/*
-			 * We decompress the compressed columns on demand, so that we can
-			 * skip decompressing some columns if the entire batch doesn't pass
-			 * the quals.
+			 * We had a default value for the compressed column, and it
+			 * didn't pass the predicate, so the entire batch didn't pass.
 			 */
-			decompress_column(dcontext, batch_state, column_index);
-			Assert(column_values->decompression_type != DT_Invalid);
+			const size_t n_batch_result_words = (batch_state->total_batch_rows + 63) / 64;
+			for (size_t i = 0; i < n_batch_result_words; i++)
+			{
+				result[i] = 0;
+			}
 		}
+	}
+}
 
-		Assert(column_values->decompression_type != DT_Iterator);
+static void compute_one_qual(DecompressContext *dcontext, DecompressBatchState *batch_state,
+							 Node *qual, uint64 *restrict result);
 
-		/*
-		 * Prepare to compute the vector predicate. We have to handle the
-		 * default values in a special way because they don't produce the usual
-		 * decompressed ArrowArrays.
-		 */
-		uint64 default_value_predicate_result;
-		uint64 *predicate_result = batch_state->vector_qual_result;
-		const ArrowArray *vector = column_values->arrow;
-		if (column_values->arrow == NULL)
+static void
+compute_qual_conjunction(DecompressContext *dcontext, DecompressBatchState *batch_state,
+						 List *quals, uint64 *restrict result)
+{
+	ListCell *lc;
+	foreach (lc, quals)
+	{
+		compute_one_qual(dcontext, batch_state, lfirst(lc), result);
+		if (get_vector_qual_summary(result, batch_state->total_batch_rows) == NoRowsPass)
 		{
 			/*
-			 * The compressed column had a default value. We can't fall back to
-			 * the non-vectorized quals now, so build a single-value ArrowArray
-			 * with this default value, check if it passes the predicate, and apply
-			 * it to the entire batch.
+			 * Exit early if no rows pass already. This might allow us to avoid
+			 * reading the columns required for the subsequent quals.
 			 */
-			Assert(column_values->decompression_type == DT_Default);
+			return;
+		}
+	}
+}
 
-			/*
-			 * We saved the actual default value into the decompressed scan slot
-			 * above, so pull it from there.
-			 */
-			vector = make_single_value_arrow(column_description->typid,
-											 *column_values->output_value,
-											 *column_values->output_isnull);
+static void
+compute_qual_disjunction(DecompressContext *dcontext, DecompressBatchState *batch_state,
+						 List *quals, uint64 *restrict result)
+{
+	const size_t n_rows = batch_state->total_batch_rows;
+	const size_t n_result_words = (n_rows + 63) / 64;
+	uint64 *or_result = palloc(sizeof(uint64) * n_result_words);
+	for (size_t i = 0; i < n_result_words; i++)
+	{
+		or_result[i] = 0;
+	}
 
-			/*
-			 * We start from an all-valid bitmap, because the predicate is
-			 * AND-ed to it.
-			 */
-			default_value_predicate_result = 1;
-			predicate_result = &default_value_predicate_result;
-		}
+	uint64 *one_qual_result = palloc(sizeof(uint64) * n_result_words);
 
-		if (nulltest)
+	ListCell *lc;
+	foreach (lc, quals)
+	{
+		for (size_t i = 0; i < n_result_words; i++)
 		{
-			vector_nulltest(vector, nulltest->nulltesttype, predicate_result);
+			one_qual_result[i] = (uint64) -1;
 		}
-		else
+		compute_one_qual(dcontext, batch_state, lfirst(lc), one_qual_result);
+		for (size_t i = 0; i < n_result_words; i++)
 		{
-			/*
-			 * Find the vector_const predicate.
-			 */
-			VectorPredicate *vector_const_predicate =
-				get_vector_const_predicate(vector_const_opcode);
-			Assert(vector_const_predicate != NULL);
-
-			Ensure(IsA(lsecond(args), Const),
-				   "failed to evaluate runtime constant in vectorized filter");
+			or_result[i] |= one_qual_result[i];
+		}
 
+		if (get_vector_qual_summary(or_result, n_rows) == AllRowsPass)
+		{
 			/*
-			 * The vectorizable predicates should be STRICT, so we shouldn't see null
-			 * constants here.
+			 * We can sometimes avoid reading the columns required for the
+			 * rest of conditions if we break out early here.
 			 */
-			Const *constnode = castNode(Const, lsecond(args));
-			Ensure(!constnode->constisnull, "vectorized predicate called for a null value");
+			return;
+		}
+	}
 
-			/*
-			 * At last, compute the predicate.
-			 */
-			if (saop)
-			{
-				vector_array_predicate(vector_const_predicate,
-									   saop->useOr,
-									   vector,
-									   constnode->constvalue,
-									   predicate_result);
-			}
-			else
-			{
-				vector_const_predicate(vector, constnode->constvalue, predicate_result);
-			}
+	for (size_t i = 0; i < n_result_words; i++)
+	{
+		result[i] &= or_result[i];
+	}
+}
 
-			/* Account for nulls which shouldn't pass the predicate. */
-			const size_t n = vector->length;
-			const size_t n_words = (n + 63) / 64;
-			const uint64 *restrict validity = (uint64 *restrict) vector->buffers[0];
-			for (size_t i = 0; i < n_words; i++)
-			{
-				predicate_result[i] &= validity[i];
-			}
-		}
+static void
+compute_one_qual(DecompressContext *dcontext, DecompressBatchState *batch_state, Node *qual,
+				 uint64 *restrict result)
+{
+	if (!IsA(qual, BoolExpr))
+	{
+		compute_plain_qual(dcontext, batch_state, qual, result);
+		return;
+	}
 
-		/* Process the result. */
-		if (column_values->arrow == NULL)
-		{
-			/* The column had a default value. */
-			Assert(column_values->decompression_type == DT_Default);
+	BoolExpr *boolexpr = castNode(BoolExpr, qual);
+	if (boolexpr->boolop == AND_EXPR)
+	{
+		compute_qual_conjunction(dcontext, batch_state, boolexpr->args, result);
+		return;
+	}
 
-			if (!(default_value_predicate_result & 1))
-			{
-				/*
-				 * We had a default value for the compressed column, and it
-				 * didn't pass the predicate, so the entire batch didn't pass.
-				 */
-				for (int i = 0; i < bitmap_bytes / 8; i++)
-				{
-					batch_state->vector_qual_result[i] = 0;
-				}
-			}
-		}
+	/*
+	 * Postgres removes NOT for operators we can vectorize, so we don't support
+	 * NOT and consider it non-vectorizable at planning time. So only OR is left.
+	 */
+	Assert(boolexpr->boolop == OR_EXPR);
+	compute_qual_disjunction(dcontext, batch_state, boolexpr->args, result);
+}
 
+/*
+ * Compute the vectorized filters. Returns a summary telling whether all, none
+ * or some rows of the batch pass. If no rows pass, the entire batch is filtered
+ * out, and we use this for further optimizations.
+ */
+static VectorQualSummary
+compute_vector_quals(DecompressContext *dcontext, DecompressBatchState *batch_state)
+{
+	/*
+	 * Allocate the bitmap that will hold the vectorized qual results. We will
+	 * initialize it to all ones and AND the individual quals to it.
+	 */
+	const size_t n_rows = batch_state->total_batch_rows;
+	const int bitmap_bytes = sizeof(uint64) * ((n_rows + 63) / 64);
+	batch_state->vector_qual_result = palloc(bitmap_bytes);
+	memset(batch_state->vector_qual_result, 0xFF, bitmap_bytes);
+	if (n_rows % 64 != 0)
+	{
 		/*
-		 * Have to return whether we have any passing rows.
+		 * We have to zero out the bits for past-the-end elements in the last
+		 * bitmap word. Since all predicates are ANDed to the result bitmap,
+		 * we can do it here once instead of doing it in each predicate.
 		 */
-		bool have_passing_rows = false;
-		for (int i = 0; i < bitmap_bytes / 8; i++)
-		{
-			have_passing_rows |= batch_state->vector_qual_result[i];
-		}
-		if (!have_passing_rows)
-		{
-			return false;
-		}
+		const uint64 mask = ((uint64) -1) >> (64 - batch_state->total_batch_rows % 64);
+		batch_state->vector_qual_result[batch_state->total_batch_rows / 64] = mask;
 	}
 
-	return true;
+	/*
+	 * Compute the quals.
+	 */
+	compute_qual_conjunction(dcontext,
+							 batch_state,
+							 dcontext->vectorized_quals_constified,
+							 batch_state->vector_qual_result);
+
+	return get_vector_qual_summary(batch_state->vector_qual_result, n_rows);
 }
 
 /*
@@ -526,8 +606,10 @@ compressed_batch_set_compressed_tuple(DecompressContext *dcontext,
 		}
 	}
 
-	const bool have_passing_rows = compute_vector_quals(dcontext, batch_state);
-	if (!have_passing_rows && !dcontext->batch_sorted_merge)
+	VectorQualSummary vector_qual_summary = dcontext->vectorized_quals_constified != NIL ?
+												compute_vector_quals(dcontext, batch_state) :
+												AllRowsPass;
+	if (vector_qual_summary == NoRowsPass && !dcontext->batch_sorted_merge)
 	{
 		/*
 		 * The entire batch doesn't pass the vectorized quals, so we might be
@@ -560,6 +642,15 @@ compressed_batch_set_compressed_tuple(DecompressContext *dcontext,
 				Assert(column_values->decompression_type != DT_Invalid);
 			}
 		}
+
+		/*
+		 * If all rows pass, no need to test the vector qual for each row. This
+		 * is a common case for time range conditions.
+		 */
+		if (vector_qual_summary == AllRowsPass)
+		{
+			batch_state->vector_qual_result = NULL;
+		}
 	}
 
 	MemoryContextSwitchTo(old_context);
diff --git a/tsl/src/nodes/decompress_chunk/compressed_batch.h b/tsl/src/nodes/decompress_chunk/compressed_batch.h
index a20f961b127..29d66826b21 100644
--- a/tsl/src/nodes/decompress_chunk/compressed_batch.h
+++ b/tsl/src/nodes/decompress_chunk/compressed_batch.h
@@ -69,7 +69,7 @@ typedef struct DecompressBatchState
 	 * row. Indexed same as arrow arrays, w/o accounting for the reverse scan
 	 * direction. Initialized to all ones, i.e. all rows pass.
 	 */
-	uint64 *vector_qual_result;
+	uint64 *restrict vector_qual_result;
 
 	CompressedColumnValues compressed_columns[FLEXIBLE_ARRAY_MEMBER];
 } DecompressBatchState;
diff --git a/tsl/src/nodes/decompress_chunk/planner.c b/tsl/src/nodes/decompress_chunk/planner.c
index ea609c241ce..26c77d7115d 100644
--- a/tsl/src/nodes/decompress_chunk/planner.c
+++ b/tsl/src/nodes/decompress_chunk/planner.c
@@ -430,8 +430,54 @@ static Node *
 make_vectorized_qual(DecompressChunkPath *path, Node *qual)
 {
 	/*
-	 * Currently we vectorize some "Var op Const" binary predicates,
-	 * scalar array operations with these predicates, and null test.
+	 * We can vectorize BoolExpr with AND/OR; NOT is rejected below.
+	 */
+	if (IsA(qual, BoolExpr))
+	{
+		BoolExpr *boolexpr = castNode(BoolExpr, qual);
+
+		if (boolexpr->boolop == NOT_EXPR)
+		{
+			/*
+			 * NOT should be removed by Postgres for all operators we can
+			 * vectorize (see prepqual.c), so we don't support it.
+			 */
+			return NULL;
+		}
+
+		bool need_copy = false;
+		List *vectorized_args = NIL;
+		ListCell *lc;
+		foreach (lc, boolexpr->args)
+		{
+			Node *arg = lfirst(lc);
+			Node *vectorized_arg = make_vectorized_qual(path, arg);
+			if (vectorized_arg == NULL)
+			{
+				return NULL;
+			}
+
+			if (vectorized_arg != arg)
+			{
+				need_copy = true;
+			}
+
+			vectorized_args = lappend(vectorized_args, vectorized_arg);
+		}
+
+		if (!need_copy)
+		{
+			return (Node *) boolexpr;
+		}
+
+		BoolExpr *boolexpr_copy = (BoolExpr *) copyObject(boolexpr);
+		boolexpr_copy->args = vectorized_args;
+		return (Node *) boolexpr_copy;
+	}
+
+	/*
+	 * Among the simple predicates, we vectorize some "Var op Const" binary
+	 * predicates, scalar array operations with these predicates, and null test.
 	 */
 	NullTest *nulltest = NULL;
 	OpExpr *opexpr = NULL;
diff --git a/tsl/src/nodes/decompress_chunk/pred_vector_array.c b/tsl/src/nodes/decompress_chunk/pred_vector_array.c
index da154644d9b..dccdbe771d3 100644
--- a/tsl/src/nodes/decompress_chunk/pred_vector_array.c
+++ b/tsl/src/nodes/decompress_chunk/pred_vector_array.c
@@ -21,10 +21,10 @@ static inline void
 vector_array_predicate_impl(VectorPredicate *vector_const_predicate, bool is_or,
 							const ArrowArray *vector, Datum array, uint64 *restrict final_result)
 {
-	const size_t result_bits = vector->length;
-	const size_t result_words = (result_bits + 63) / 64;
+	const size_t n_rows = vector->length;
+	const size_t result_words = (n_rows + 63) / 64;
 
-	uint64 *restrict array_result = NULL;
+	uint64 *restrict array_result = final_result;
 	/*
 	 * For OR, we need an intermediate storage to accumulate the results
 	 * from all elements.
@@ -38,17 +38,6 @@ vector_array_predicate_impl(VectorPredicate *vector_const_predicate, bool is_or,
 		{
 			array_result_storage[i] = 0;
 		}
-
-		if (vector->length % 64 != 0)
-		{
-			/*
-			 * Set the bits for past-the-end elements to 1. This way it's more
-			 * convenient to check for early exit, and the final result should
-			 * have them already set to 0 so it doesn't matter.
-			 */
-			const uint64 mask = ((uint64) -1) << (vector->length % 64);
-			array_result[vector->length / 64] = mask;
-		}
 	}
 
 	ArrayType *arr = DatumGetArrayTypeP(array);
@@ -84,7 +73,7 @@ vector_array_predicate_impl(VectorPredicate *vector_const_predicate, bool is_or,
 
 			for (size_t word = 0; word < result_words; word++)
 			{
-				final_result[word] = 0;
+				array_result[word] = 0;
 			}
 			return;
 		}
@@ -111,7 +100,7 @@ vector_array_predicate_impl(VectorPredicate *vector_const_predicate, bool is_or,
 		}
 		else
 		{
-			single_result = final_result;
+			single_result = array_result;
 		}
 
 		vector_const_predicate(vector, constvalue, single_result);
@@ -125,45 +114,14 @@ vector_array_predicate_impl(VectorPredicate *vector_const_predicate, bool is_or,
 		}
 
 		/*
-		 * On big arrays, we want to sometimes check if we can exit early,
-		 * to avoid being slower than the non-vectorized version which exits
-		 * at first possibility. The frequency is chosen by benchmarking.
-		 * In debug mode, do this more frequently to simplify testing.
+		 * The bitmaps are small, no more than 16 qwords for our maximal
+		 * compressed batch size of 1000 rows, so we can check for early exit
+		 * after every array element.
 		 */
-#ifdef NDEBUG
-		if (array_index > 0 && array_index % 16 == 0)
-#else
-		if (array_index > 0 && array_index % 3 == 0)
-#endif
+		VectorQualSummary summary = get_vector_qual_summary(array_result, n_rows);
+		if (summary == (is_or ? AllRowsPass : NoRowsPass))
 		{
-			if (is_or)
-			{
-				/*
-				 * Note that we have set the bits for past-the-end rows in
-				 * array_result to 1, so we can use simple AND here.
-				 */
-				uint64 all_rows_match = -1;
-				for (size_t word = 0; word < result_words; word++)
-				{
-					all_rows_match &= array_result[word];
-				}
-				if (all_rows_match == -1ULL)
-				{
-					return;
-				}
-			}
-			else
-			{
-				uint64 any_rows_match = 0;
-				for (size_t word = 0; word < result_words; word++)
-				{
-					any_rows_match |= final_result[word];
-				}
-				if (any_rows_match == 0)
-				{
-					return;
-				}
-			}
+			return;
 		}
 	}
 
diff --git a/tsl/src/nodes/decompress_chunk/vector_predicates.h b/tsl/src/nodes/decompress_chunk/vector_predicates.h
index 06a4b40435b..c8874efeef3 100644
--- a/tsl/src/nodes/decompress_chunk/vector_predicates.h
+++ b/tsl/src/nodes/decompress_chunk/vector_predicates.h
@@ -17,3 +17,43 @@ void vector_array_predicate(VectorPredicate *scalar_predicate, bool is_or, const
 							Datum array, uint64 *restrict result);
 
 void vector_nulltest(const ArrowArray *arrow, int test_type, uint64 *restrict result);
+
+typedef enum VectorQualSummary
+{
+	AllRowsPass,
+	NoRowsPass,
+	SomeRowsPass
+} VectorQualSummary;
+
+static pg_attribute_always_inline VectorQualSummary
+get_vector_qual_summary(uint64 *restrict qual_result, size_t n_rows)
+{
+	bool any_rows_pass = false;
+	bool all_rows_pass = true;
+	for (size_t i = 0; i < n_rows / 64; i++)
+	{
+		any_rows_pass |= (qual_result[i] != 0);
+		all_rows_pass &= (~qual_result[i] == 0);
+	}
+
+	if (n_rows % 64 != 0)
+	{
+		const uint64 last_word_mask = -1ULL >> (64 - n_rows % 64);
+		any_rows_pass |= (qual_result[n_rows / 64] & last_word_mask) != 0;
+		all_rows_pass &= ((~qual_result[n_rows / 64]) & last_word_mask) == 0;
+	}
+
+	Assert(!(all_rows_pass && !any_rows_pass));
+
+	if (!any_rows_pass)
+	{
+		return NoRowsPass;
+	}
+
+	if (all_rows_pass)
+	{
+		return AllRowsPass;
+	}
+
+	return SomeRowsPass;
+}
diff --git a/tsl/test/expected/decompress_vector_qual.out b/tsl/test/expected/decompress_vector_qual.out
index ec90fa1073d..e9f8965624b 100644
--- a/tsl/test/expected/decompress_vector_qual.out
+++ b/tsl/test/expected/decompress_vector_qual.out
@@ -245,8 +245,10 @@ select * from vectorqual where ts > '2021-01-01 00:00:00' and metric3 > 40 order
  Wed Jan 01 00:00:00 2025 |      52 |      5 |      53 |      54 | tag5
 (2 rows)
 
--- ORed constrainst on multiple columns (not vectorized for now).
-set timescaledb.debug_require_vector_qual to 'forbid';
+-- ORed constrainst on multiple columns.
+set timescaledb.debug_require_vector_qual to 'only';
+-- set timescaledb.debug_require_vector_qual to 'forbid';
+-- set timescaledb.enable_bulk_decompression to off;
 select * from vectorqual where ts > '2021-01-01 00:00:00' or metric3 > 40 order by vectorqual;
             ts            | metric2 | device | metric3 | metric4 | tag  
 --------------------------+---------+--------+---------+---------+------
@@ -257,7 +259,54 @@ select * from vectorqual where ts > '2021-01-01 00:00:00' or metric3 > 40 order
  Wed Jan 01 00:00:00 2025 |      52 |      5 |      53 |      54 | tag5
 (5 rows)
 
+-- Some more tests for boolean operations.
+select count(*) from vectorqual where ts > '2021-01-01 00:00:00';
+ count 
+-------
+     3
+(1 row)
+
+select count(*) from vectorqual where 40 < metric3;
+ count 
+-------
+     4
+(1 row)
+
+select count(*) from vectorqual where metric2 < 0;
+ count 
+-------
+     0
+(1 row)
+
+select count(*) from vectorqual where ts > '2021-01-01 00:00:00' or 40 < metric3;
+ count 
+-------
+     5
+(1 row)
+
+select count(*) from vectorqual where not (ts <= '2021-01-01 00:00:00' and 40 >= metric3);
+ count 
+-------
+     5
+(1 row)
+
+-- early exit inside AND BoolExpr
+select count(*) from vectorqual where metric2 < 0 or (metric4 < -1 and 40 >= metric3);
+ count 
+-------
+     0
+(1 row)
+
+-- early exit after OR BoolExpr
+select count(*) from vectorqual where metric2 < 0 or metric3  < -1;
+ count 
+-------
+     0
+(1 row)
+
+reset timescaledb.enable_bulk_decompression;
 -- Test with unary operator.
+set timescaledb.debug_require_vector_qual to 'forbid';
 create operator !! (function = 'bool', rightarg = int4);
 select count(*) from vectorqual where !!metric3;
  count 
@@ -265,6 +314,12 @@ select count(*) from vectorqual where !!metric3;
      5
 (1 row)
 
+select count(*) from vectorqual where not !!metric3;
+ count 
+-------
+     0
+(1 row)
+
 -- Custom operator on column that supports bulk decompression is not vectorized.
 set timescaledb.debug_require_vector_qual to 'forbid';
 create function int4eqq(int4, int4) returns bool as 'int4eq' language internal;
@@ -281,6 +336,18 @@ select count(*) from vectorqual where metric3 === any(array[777, 888]);
      2
 (1 row)
 
+select count(*) from vectorqual where not metric3 === 777;
+ count 
+-------
+     3
+(1 row)
+
+select count(*) from vectorqual where metric3 = 777 or metric3 === 777;
+ count 
+-------
+     2
+(1 row)
+
 -- It also doesn't have a commutator.
 select count(*) from vectorqual where 777 === metric3;
  count 
@@ -633,6 +700,212 @@ select count(*) from singlebatch where metric2 <= all(array[12, 12, 12, 12, 12])
      0
 (1 row)
 
+-- Also check early exit for AND/OR. Top-level clause must be OR, because top-level
+-- AND is flattened into a list.
+select count(*) from singlebatch where (metric2 < 20 and metric2 < 30) or metric3 = 777;
+ count 
+-------
+     2
+(1 row)
+
+select count(*) from singlebatch where (metric2 < 30 and metric2 < 20) or metric3 = 777;
+ count 
+-------
+     2
+(1 row)
+
+select count(*) from singlebatch where metric3 = 777 or (metric2 < 20 and metric2 < 30);
+ count 
+-------
+     2
+(1 row)
+
+select count(*) from singlebatch where metric3 = 777 or (metric2 < 30 and metric2 < 20);
+ count 
+-------
+     2
+(1 row)
+
+select count(*) from vectorqual where (metric2 < 20 and metric2 < 30) or metric3 = 777;
+ count 
+-------
+     2
+(1 row)
+
+select count(*) from vectorqual where (metric2 < 30 and metric2 < 20) or metric3 = 777;
+ count 
+-------
+     2
+(1 row)
+
+select count(*) from vectorqual where metric3 = 777 or (metric2 < 20 and metric2 < 30);
+ count 
+-------
+     2
+(1 row)
+
+select count(*) from vectorqual where metric3 = 777 or (metric2 < 30 and metric2 < 20);
+ count 
+-------
+     2
+(1 row)
+
+select count(*) from singlebatch where metric2 < 20 or metric3 < 50 or metric3 > 50;
+ count 
+-------
+     5
+(1 row)
+
+select count(*) from singlebatch where metric2 < 20 or metric3 > 50 or metric3 < 50;
+ count 
+-------
+     5
+(1 row)
+
+select count(*) from singlebatch where metric3 < 50 or metric2 < 20 or metric3 > 50;
+ count 
+-------
+     5
+(1 row)
+
+select count(*) from singlebatch where metric3 > 50 or metric3 < 50 or metric2 < 20;
+ count 
+-------
+     5
+(1 row)
+
+select count(*) from vectorqual where metric2 < 20 or metric3 < 50 or metric3 > 50;
+ count 
+-------
+     5
+(1 row)
+
+select count(*) from vectorqual where metric2 < 20 or metric3 > 50 or metric3 < 50;
+ count 
+-------
+     5
+(1 row)
+
+select count(*) from vectorqual where metric3 < 50 or metric2 < 20 or metric3 > 50;
+ count 
+-------
+     5
+(1 row)
+
+select count(*) from vectorqual where metric3 > 50 or metric3 < 50 or metric2 < 20;
+ count 
+-------
+     5
+(1 row)
+
+select count(*) from singlebatch where metric2 = 12 or metric3 = 888;
+ count 
+-------
+     1
+(1 row)
+
+select count(*) from singlebatch where metric2 = 22 or metric3 = 888;
+ count 
+-------
+     1
+(1 row)
+
+select count(*) from singlebatch where metric2 = 32 or metric3 = 888;
+ count 
+-------
+     1
+(1 row)
+
+select count(*) from singlebatch where metric2 = 42 or metric3 = 888;
+ count 
+-------
+     1
+(1 row)
+
+select count(*) from singlebatch where metric2 = 52 or metric3 = 888;
+ count 
+-------
+     1
+(1 row)
+
+select count(*) from vectorqual where metric2 = 12 or metric3 = 888;
+ count 
+-------
+     1
+(1 row)
+
+select count(*) from vectorqual where metric2 = 22 or metric3 = 888;
+ count 
+-------
+     1
+(1 row)
+
+select count(*) from vectorqual where metric2 = 32 or metric3 = 888;
+ count 
+-------
+     1
+(1 row)
+
+select count(*) from vectorqual where metric2 = 42 or metric3 = 888;
+ count 
+-------
+     1
+(1 row)
+
+select count(*) from vectorqual where metric2 = 52 or metric3 = 888;
+ count 
+-------
+     1
+(1 row)
+
+select count(*) from singlebatch where ts > '2024-01-01' or (metric3 = 777 and metric2 = 12);
+ count 
+-------
+     1
+(1 row)
+
+select count(*) from singlebatch where ts > '2024-01-01' or (metric3 = 777 and metric2 = 666);
+ count 
+-------
+     0
+(1 row)
+
+select count(*) from singlebatch where ts > '2024-01-01' or (metric3 = 888 and metric2 = 12);
+ count 
+-------
+     0
+(1 row)
+
+select count(*) from singlebatch where ts > '2024-01-01' or (metric3 = 888 and metric2 = 666);
+ count 
+-------
+     0
+(1 row)
+
+select count(*) from vectorqual where ts > '2024-01-01' or (metric3 = 777 and metric2 = 12);
+ count 
+-------
+     2
+(1 row)
+
+select count(*) from vectorqual where ts > '2024-01-01' or (metric3 = 777 and metric2 = 666);
+ count 
+-------
+     1
+(1 row)
+
+select count(*) from vectorqual where ts > '2024-01-01' or (metric3 = 888 and metric2 = 12);
+ count 
+-------
+     1
+(1 row)
+
+select count(*) from vectorqual where ts > '2024-01-01' or (metric3 = 888 and metric2 = 666);
+ count 
+-------
+     1
+(1 row)
+
 reset timescaledb.enable_bulk_decompression;
 reset timescaledb.debug_require_vector_qual;
 -- Comparison with other column not vectorized.
diff --git a/tsl/test/sql/decompress_vector_qual.sql b/tsl/test/sql/decompress_vector_qual.sql
index efffcd2ce8d..446ec7e2052 100644
--- a/tsl/test/sql/decompress_vector_qual.sql
+++ b/tsl/test/sql/decompress_vector_qual.sql
@@ -109,14 +109,38 @@ select metric4 from vectorqual where ts > '2021-01-01 00:00:00' order by 1;
 select * from vectorqual where ts > '2021-01-01 00:00:00' and metric3 > 40 order by vectorqual;
 
 
--- ORed constrainst on multiple columns (not vectorized for now).
-set timescaledb.debug_require_vector_qual to 'forbid';
+-- ORed constrainst on multiple columns.
+set timescaledb.debug_require_vector_qual to 'only';
+-- set timescaledb.debug_require_vector_qual to 'forbid';
+-- set timescaledb.enable_bulk_decompression to off;
+
 select * from vectorqual where ts > '2021-01-01 00:00:00' or metric3 > 40 order by vectorqual;
 
+-- Some more tests for boolean operations.
+select count(*) from vectorqual where ts > '2021-01-01 00:00:00';
+
+select count(*) from vectorqual where 40 < metric3;
+
+select count(*) from vectorqual where metric2 < 0;
+
+select count(*) from vectorqual where ts > '2021-01-01 00:00:00' or 40 < metric3;
+
+select count(*) from vectorqual where not (ts <= '2021-01-01 00:00:00' and 40 >= metric3);
+
+-- early exit inside AND BoolExpr
+select count(*) from vectorqual where metric2 < 0 or (metric4 < -1 and 40 >= metric3);
+
+-- early exit after OR BoolExpr
+select count(*) from vectorqual where metric2 < 0 or metric3  < -1;
+
+reset timescaledb.enable_bulk_decompression;
+
 
 -- Test with unary operator.
+set timescaledb.debug_require_vector_qual to 'forbid';
 create operator !! (function = 'bool', rightarg = int4);
 select count(*) from vectorqual where !!metric3;
+select count(*) from vectorqual where not !!metric3;
 
 
 -- Custom operator on column that supports bulk decompression is not vectorized.
@@ -125,6 +149,8 @@ create function int4eqq(int4, int4) returns bool as 'int4eq' language internal;
 create operator === (function = 'int4eqq', rightarg = int4, leftarg = int4);
 select count(*) from vectorqual where metric3 === 777;
 select count(*) from vectorqual where metric3 === any(array[777, 888]);
+select count(*) from vectorqual where not metric3 === 777;
+select count(*) from vectorqual where metric3 = 777 or metric3 === 777;
 
 -- It also doesn't have a commutator.
 select count(*) from vectorqual where 777 === metric3;
@@ -217,6 +243,51 @@ select count(*) from singlebatch where metric2 <= all(array[12, 12, 12, 12, 0])
 select count(*) from singlebatch where metric2 <= all(array[12, 0, 12, 12, 12]) and metric3 != 777;
 select count(*) from singlebatch where metric2 <= all(array[12, 12, 12, 12, 12]) and metric3 != 777;
 
+
+-- Also check early exit for AND/OR. Top-level clause must be OR, because top-level
+-- AND is flattened into a list.
+select count(*) from singlebatch where (metric2 < 20 and metric2 < 30) or metric3 = 777;
+select count(*) from singlebatch where (metric2 < 30 and metric2 < 20) or metric3 = 777;
+select count(*) from singlebatch where metric3 = 777 or (metric2 < 20 and metric2 < 30);
+select count(*) from singlebatch where metric3 = 777 or (metric2 < 30 and metric2 < 20);
+
+select count(*) from vectorqual where (metric2 < 20 and metric2 < 30) or metric3 = 777;
+select count(*) from vectorqual where (metric2 < 30 and metric2 < 20) or metric3 = 777;
+select count(*) from vectorqual where metric3 = 777 or (metric2 < 20 and metric2 < 30);
+select count(*) from vectorqual where metric3 = 777 or (metric2 < 30 and metric2 < 20);
+
+select count(*) from singlebatch where metric2 < 20 or metric3 < 50 or metric3 > 50;
+select count(*) from singlebatch where metric2 < 20 or metric3 > 50 or metric3 < 50;
+select count(*) from singlebatch where metric3 < 50 or metric2 < 20 or metric3 > 50;
+select count(*) from singlebatch where metric3 > 50 or metric3 < 50 or metric2 < 20;
+
+select count(*) from vectorqual where metric2 < 20 or metric3 < 50 or metric3 > 50;
+select count(*) from vectorqual where metric2 < 20 or metric3 > 50 or metric3 < 50;
+select count(*) from vectorqual where metric3 < 50 or metric2 < 20 or metric3 > 50;
+select count(*) from vectorqual where metric3 > 50 or metric3 < 50 or metric2 < 20;
+
+select count(*) from singlebatch where metric2 = 12 or metric3 = 888;
+select count(*) from singlebatch where metric2 = 22 or metric3 = 888;
+select count(*) from singlebatch where metric2 = 32 or metric3 = 888;
+select count(*) from singlebatch where metric2 = 42 or metric3 = 888;
+select count(*) from singlebatch where metric2 = 52 or metric3 = 888;
+
+select count(*) from vectorqual where metric2 = 12 or metric3 = 888;
+select count(*) from vectorqual where metric2 = 22 or metric3 = 888;
+select count(*) from vectorqual where metric2 = 32 or metric3 = 888;
+select count(*) from vectorqual where metric2 = 42 or metric3 = 888;
+select count(*) from vectorqual where metric2 = 52 or metric3 = 888;
+
+select count(*) from singlebatch where ts > '2024-01-01' or (metric3 = 777 and metric2 = 12);
+select count(*) from singlebatch where ts > '2024-01-01' or (metric3 = 777 and metric2 = 666);
+select count(*) from singlebatch where ts > '2024-01-01' or (metric3 = 888 and metric2 = 12);
+select count(*) from singlebatch where ts > '2024-01-01' or (metric3 = 888 and metric2 = 666);
+
+select count(*) from vectorqual where ts > '2024-01-01' or (metric3 = 777 and metric2 = 12);
+select count(*) from vectorqual where ts > '2024-01-01' or (metric3 = 777 and metric2 = 666);
+select count(*) from vectorqual where ts > '2024-01-01' or (metric3 = 888 and metric2 = 12);
+select count(*) from vectorqual where ts > '2024-01-01' or (metric3 = 888 and metric2 = 666);
+
 reset timescaledb.enable_bulk_decompression;
 reset timescaledb.debug_require_vector_qual;