Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Spark: Remove extra columns for ColumnBatch #11551

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions data/src/main/java/org/apache/iceberg/data/DeleteFilter.java
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ public abstract class DeleteFilter<T> {
private final List<DeleteFile> posDeletes;
private final List<DeleteFile> eqDeletes;
private final Schema requiredSchema;
private final Schema expectedSchema;
private final Accessor<StructLike> posAccessor;
private final boolean hasIsDeletedColumn;
private final int isDeletedColumnPosition;
Expand All @@ -68,11 +69,12 @@ protected DeleteFilter(
String filePath,
List<DeleteFile> deletes,
Schema tableSchema,
Schema requestedSchema,
Schema expectedSchema,
DeleteCounter counter,
boolean needRowPosCol) {
this.filePath = filePath;
this.counter = counter;
this.expectedSchema = expectedSchema;

ImmutableList.Builder<DeleteFile> posDeleteBuilder = ImmutableList.builder();
ImmutableList.Builder<DeleteFile> eqDeleteBuilder = ImmutableList.builder();
Expand All @@ -95,7 +97,7 @@ protected DeleteFilter(
this.posDeletes = posDeleteBuilder.build();
this.eqDeletes = eqDeleteBuilder.build();
this.requiredSchema =
fileProjection(tableSchema, requestedSchema, posDeletes, eqDeletes, needRowPosCol);
fileProjection(tableSchema, expectedSchema, posDeletes, eqDeletes, needRowPosCol);
this.posAccessor = requiredSchema.accessorForField(MetadataColumns.ROW_POSITION.fieldId());
this.hasIsDeletedColumn =
requiredSchema.findField(MetadataColumns.IS_DELETED.fieldId()) != null;
Expand Down Expand Up @@ -124,6 +126,10 @@ public Schema requiredSchema() {
return requiredSchema;
}

/**
 * Returns the schema that was passed to the constructor as {@code expectedSchema}.
 *
 * <p>This is the schema as supplied by the caller, as opposed to {@link #requiredSchema()},
 * which is derived from it via {@code fileProjection(...)} together with the delete files.
 *
 * @return the {@code expectedSchema} stored at construction time
 */
public Schema expectedSchema() {
return expectedSchema;
}

/**
 * Reports whether the {@code posDeletes} list built in the constructor contains any delete files.
 *
 * @return {@code true} if {@code posDeletes} is non-empty, {@code false} otherwise
 */
public boolean hasPosDeletes() {
return !posDeletes.isEmpty();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ public void cleanup() throws IOException {
dropTable("test2");
}

private void initDateTable() throws IOException {
protected void initDateTable() throws IOException {
dropTable("test2");
this.dateTableName = "test2";
this.dateTable = createTable(dateTableName, DATE_SCHEMA, DATE_SPEC);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
*/
package org.apache.iceberg.spark.data.vectorized;

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
Expand Down Expand Up @@ -102,6 +103,7 @@ ColumnarBatch loadDataToColumnBatch() {

if (hasEqDeletes()) {
applyEqDelete(newColumnarBatch);
newColumnarBatch = removeExtraColumns(arrowColumnVectors, newColumnarBatch);
}

if (hasIsDeletedColumn && rowIdMapping != null) {
Expand Down Expand Up @@ -245,5 +247,34 @@ void applyEqDelete(ColumnarBatch columnarBatch) {

columnarBatch.setNumRows(currentRowId);
}

/**
* Removes extra columns added for processing equality delete filters that are not part of the
* final query output.
*
* <p>During query execution, additional columns may be included in the schema to evaluate
* equality delete filters. For example, if the table schema contains columns C1, C2, C3, C4,
* and C5, and the query is 'SELECT C5 FROM table' while equality delete filters are applied on
* C3 and C4, the processing schema includes C5, C3, and C4. These extra columns (C3 and C4) are
* needed to identify rows to delete but are not included in the final result.
*
* <p>This method removes these extra columns from the end of {@code arrowColumnVectors},
* ensuring only the expected columns remain.
*
* @param arrowColumnVectors the array of column vectors representing query result data
* @param columnarBatch the original {@code ColumnarBatch} containing query results
* @return a new {@code ColumnarBatch} with extra columns removed, or the original batch if no
* extra columns were found
*/
ColumnarBatch removeExtraColumns(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be helpful to keep the following comments as part of the JavaDoc for the method to provide clarity and context:

    /**
     * Removes extra columns added for processing equality delete filters that are not part of the
     * final query output.
     * <p>
     * During query execution, additional columns may be included in the schema to evaluate equality
     * delete filters. For example, if the table schema contains columns C1, C2, C3, C4, and C5, and
     * the query is 'SELECT C5 FROM table' while equality delete filters are applied on C3 and C4,
     * the processing schema includes C5, C3, and C4. These extra columns (C3 and C4) are needed
     * to identify rows to delete but are not included in the final result.
     * <p>
     * This method removes these extra columns from the end of {@code arrowColumnVectors}, ensuring
     * only the expected columns remain.
     *
     * @param arrowColumnVectors the array of column vectors representing query result data
     * @param columnarBatch the original {@code ColumnarBatch} containing query results
     * @return a new {@code ColumnarBatch} with extra columns removed, or the original batch if no
     *         extra columns were found
     */

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, missed this part. Added.

ColumnVector[] arrowColumnVectors, ColumnarBatch columnarBatch) {
// Number of top-level columns in the schema the delete filter was constructed with.
int expectedColumnSize = deletes.expectedSchema().columns().size();
if (arrowColumnVectors.length > expectedColumnSize) {
// Keep only the leading expectedColumnSize vectors; anything after them is dropped.
// NOTE(review): this relies on the extra columns being positioned at the end of the array.
ColumnVector[] newColumns = Arrays.copyOf(arrowColumnVectors, expectedColumnSize);
// Build a new batch over the truncated vectors, carrying over the original batch's row count.
return new ColumnarBatch(newColumns, columnarBatch.numRows());
} else {
// No surplus vectors: hand back the original batch untouched.
return columnarBatch;
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,13 @@
import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS;
import static org.apache.iceberg.spark.source.SparkSQLExecutionHelper.lastExecutedMetricValue;
import static org.apache.iceberg.types.Types.NestedField.required;
import static org.apache.spark.sql.types.DataTypes.IntegerType;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assumptions.assumeThat;

import java.io.File;
import java.io.IOException;
import java.time.LocalDate;
import java.util.List;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
Expand Down Expand Up @@ -86,6 +88,7 @@
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.internal.SQLConf;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.sql.vectorized.ColumnarBatch;
import org.jetbrains.annotations.NotNull;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
Expand All @@ -95,7 +98,6 @@

@ExtendWith(ParameterizedTestExtension.class)
public class TestSparkReaderDeletes extends DeleteReadTests {

private static TestHiveMetastore metastore = null;
protected static SparkSession spark = null;
protected static HiveCatalog catalog = null;
Expand Down Expand Up @@ -622,6 +624,51 @@ public void testPosDeletesOnParquetFileWithMultipleRowGroups() throws IOExceptio
assertThat(rowSet(tblName, tbl, "*")).hasSize(193);
}

@TestTemplate
public void testEqualityDeleteWithDifferentScanAndDeleteColumns() throws IOException {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test is expected to pass even without the fix provided by this PR. Currently, the extra columns returned to Spark do not cause any problems. However, with Comet native execution, since Comet allocates arrays in a pre-allocated list and relies on the requested schema to determine the number of columns in the batch, this test would fail without the fix proposed in this PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible to check the intermediate results, for example by checking the ColumnarBatch returned to Spark? That way we could avoid using Comet as a dependency for the test.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have changed the test to check the number of columns in ColumnarBatch

assumeThat(format).isEqualTo(FileFormat.PARQUET);
initDateTable();

Schema deleteRowSchema = dateTable.schema().select("dt");
Record dataDelete = GenericRecord.create(deleteRowSchema);
List<Record> dataDeletes =
Lists.newArrayList(
dataDelete.copy("dt", LocalDate.parse("2021-09-01")),
dataDelete.copy("dt", LocalDate.parse("2021-09-02")),
dataDelete.copy("dt", LocalDate.parse("2021-09-03")));

DeleteFile eqDeletes =
FileHelpers.writeDeleteFile(
dateTable,
Files.localOutput(File.createTempFile("junit", null, temp.toFile())),
TestHelpers.Row.of(0),
dataDeletes.subList(0, 3),
deleteRowSchema);

dateTable.newRowDelta().addDeletes(eqDeletes).commit();

CloseableIterable<CombinedScanTask> tasks =
TableScanUtil.planTasks(
dateTable.newScan().planFiles(),
TableProperties.METADATA_SPLIT_SIZE_DEFAULT,
TableProperties.SPLIT_LOOKBACK_DEFAULT,
TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT);

for (CombinedScanTask task : tasks) {
try (BatchDataReader reader =
new BatchDataReader(
// expected column is id, while the equality filter column is dt
dateTable, task, dateTable.schema(), dateTable.schema().select("id"), false, 7)) {
while (reader.next()) {
ColumnarBatch columnarBatch = reader.get();
int numOfCols = columnarBatch.numCols();
assertThat(numOfCols).as("Number of columns").isEqualTo(1);
Comment on lines +664 to +665
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor: could we also check the column type to make sure dt is removed, like the following?

          // only the expected column(id) is kept
          assertThat(columnarBatch.numCols()).as("Number of columns").isEqualTo(1);
          assertThat(columnarBatch.column(0).dataType()).as("Column type").isEqualTo(IntegerType);

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added. Thanks!

assertThat(columnarBatch.column(0).dataType()).as("Column type").isEqualTo(IntegerType);
}
}
}
}

private static final Schema PROJECTION_SCHEMA =
new Schema(
required(1, "id", Types.IntegerType.get()),
Expand Down