Skip to content

Commit

Permalink
Core: Do not use a lazy split offset list in manifests (#8834)
Browse files Browse the repository at this point in the history
The list was not correctly invalidated when reusing the file.
  • Loading branch information
bryanck authored Oct 16, 2023
1 parent 05c789b commit 46cad6d
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 14 deletions.
9 changes: 1 addition & 8 deletions core/src/main/java/org/apache/iceberg/BaseFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,6 @@ public PartitionData copy() {
// cached schema
private transient Schema avroSchema = null;

// lazy variables
private transient volatile List<Long> splitOffsetList = null;

/** Used by Avro reflection to instantiate this class when reading manifest files. */
BaseFile(Schema avroSchema) {
this.avroSchema = avroSchema;
Expand Down Expand Up @@ -463,11 +460,7 @@ public ByteBuffer keyMetadata() {

@Override
public List<Long> splitOffsets() {
if (splitOffsetList == null && splitOffsets != null) {
this.splitOffsetList = ArrayUtil.toUnmodifiableLongList(splitOffsets);
}

return splitOffsetList;
return ArrayUtil.toUnmodifiableLongList(splitOffsets);
}

long[] splitOffsetArray() {
Expand Down
3 changes: 3 additions & 0 deletions core/src/test/java/org/apache/iceberg/TableTestBase.java
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ public class TableTestBase {
.withFileSizeInBytes(10)
.withPartitionPath("data_bucket=1") // easy way to set partition data for now
.withRecordCount(1)
.withSplitOffsets(ImmutableList.of(1L))
.build();
static final DeleteFile FILE_B_DELETES =
FileMetadata.deleteFileBuilder(SPEC)
Expand All @@ -112,6 +113,7 @@ public class TableTestBase {
.withFileSizeInBytes(10)
.withPartitionPath("data_bucket=2") // easy way to set partition data for now
.withRecordCount(1)
.withSplitOffsets(ImmutableList.of(2L, 2_000_000L))
.build();
static final DeleteFile FILE_C2_DELETES =
FileMetadata.deleteFileBuilder(SPEC)
Expand All @@ -127,6 +129,7 @@ public class TableTestBase {
.withFileSizeInBytes(10)
.withPartitionPath("data_bucket=3") // easy way to set partition data for now
.withRecordCount(1)
.withSplitOffsets(ImmutableList.of(3L, 3_000L, 3_000_000L))
.build();
static final DeleteFile FILE_D2_DELETES =
FileMetadata.deleteFileBuilder(SPEC)
Expand Down
19 changes: 13 additions & 6 deletions core/src/test/java/org/apache/iceberg/TestManifestReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
*/
package org.apache.iceberg;

import static org.assertj.core.api.Assertions.assertThat;

import java.io.IOException;
import java.util.List;
import java.util.stream.Collectors;
Expand All @@ -28,6 +30,7 @@
import org.apache.iceberg.relocated.com.google.common.collect.Streams;
import org.apache.iceberg.types.Types;
import org.assertj.core.api.Assertions;
import org.assertj.core.api.recursive.comparison.RecursiveComparisonConfiguration;
import org.junit.Assert;
import org.junit.Assume;
import org.junit.Test;
Expand All @@ -41,6 +44,12 @@ public static Object[] parameters() {
return new Object[] {1, 2};
}

private static final RecursiveComparisonConfiguration FILE_COMPARISON_CONFIG =
RecursiveComparisonConfiguration.builder()
.withIgnoredFields(
"dataSequenceNumber", "fileOrdinal", "fileSequenceNumber", "fromProjectionPos")
.build();

public TestManifestReader(int formatVersion) {
super(formatVersion);
}
Expand All @@ -61,16 +70,14 @@ public void testReaderWithFilterWithoutSelect() throws IOException {
ManifestFile manifest = writeManifest(1000L, FILE_A, FILE_B, FILE_C);
try (ManifestReader<DataFile> reader =
ManifestFiles.read(manifest, FILE_IO).filterRows(Expressions.equal("id", 0))) {
List<String> files =
Streams.stream(reader).map(file -> file.path().toString()).collect(Collectors.toList());
List<DataFile> files = Streams.stream(reader).collect(Collectors.toList());

// note that all files are returned because the reader returns data files that may match, and
// the partition is
// bucketing by data, which doesn't help filter files
Assert.assertEquals(
"Should read the expected files",
Lists.newArrayList(FILE_A.path(), FILE_B.path(), FILE_C.path()),
files);
assertThat(files)
.usingRecursiveComparison(FILE_COMPARISON_CONFIG)
.isEqualTo(Lists.newArrayList(FILE_A, FILE_B, FILE_C));
}
}

Expand Down

0 comments on commit 46cad6d

Please sign in to comment.