Skip to content

Commit

Permalink
Core: Optimize computing user-facing state in data tasks (apache#8346)
Browse files Browse the repository at this point in the history
  • Loading branch information
aokolnychyi authored Aug 24, 2023
1 parent d61159e commit 181d3e2
Show file tree
Hide file tree
Showing 3 changed files with 129 additions and 10 deletions.
40 changes: 34 additions & 6 deletions api/src/main/java/org/apache/iceberg/BaseScanTaskGroup.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
package org.apache.iceberg;

import java.util.Collection;
import java.util.List;
import java.util.Collections;
import org.apache.iceberg.relocated.com.google.common.base.Joiner;
import org.apache.iceberg.relocated.com.google.common.base.MoreObjects;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
Expand All @@ -28,12 +28,13 @@
public class BaseScanTaskGroup<T extends ScanTask> implements ScanTaskGroup<T> {
private final StructLike groupingKey;
private final Object[] tasks;
private transient volatile List<T> taskList;
private transient volatile Collection<T> taskCollection;

/**
 * Creates a task group for the given grouping key.
 *
 * <p>The tasks are captured twice: as an array snapshot ({@code toArray()}) that backs the
 * aggregate computations, and as an unmodifiable view over the passed collection that is served
 * by {@code tasks()} without building a new list.
 *
 * @param groupingKey the grouping key of this group
 * @param tasks the tasks in this group, must not be null
 */
public BaseScanTaskGroup(StructLike groupingKey, Collection<T> tasks) {
  Preconditions.checkNotNull(tasks, "tasks cannot be null");
  // view over the caller's collection; avoids a copy for the common in-process case
  this.taskCollection = Collections.unmodifiableCollection(tasks);
  // array snapshot used by the aggregate methods and to rebuild the collection when unset
  this.tasks = tasks.toArray();
  this.groupingKey = groupingKey;
}

public BaseScanTaskGroup(Collection<T> tasks) {
Expand All @@ -48,20 +49,47 @@ public StructLike groupingKey() {
@Override
@SuppressWarnings("unchecked")
public Collection<T> tasks() {
if (taskList == null) {
if (taskCollection == null) {
synchronized (this) {
if (taskList == null) {
if (taskCollection == null) {
ImmutableList.Builder<T> listBuilder =
ImmutableList.builderWithExpectedSize(tasks.length);
for (Object task : tasks) {
listBuilder.add((T) task);
}
taskList = listBuilder.build();
this.taskCollection = listBuilder.build();
}
}
}

return taskList;
return taskCollection;
}

/**
 * Sums {@code sizeBytes()} over every task held in the task array.
 *
 * @return the combined size of all tasks in this group, 0 if the group is empty
 */
@Override
public long sizeBytes() {
  long total = 0L;
  for (int pos = 0; pos < tasks.length; pos += 1) {
    ScanTask current = (ScanTask) tasks[pos];
    total += current.sizeBytes();
  }
  return total;
}

/**
 * Sums {@code estimatedRowsCount()} over every task held in the task array.
 *
 * @return the combined row estimate of all tasks in this group, 0 if the group is empty
 */
@Override
public long estimatedRowsCount() {
  long total = 0L;
  for (int pos = 0; pos < tasks.length; pos += 1) {
    ScanTask current = (ScanTask) tasks[pos];
    total += current.estimatedRowsCount();
  }
  return total;
}

/**
 * Sums {@code filesCount()} over every task held in the task array.
 *
 * @return the combined file count of all tasks in this group, 0 if the group is empty
 */
@Override
public int filesCount() {
  int total = 0;
  for (int pos = 0; pos < tasks.length; pos += 1) {
    ScanTask current = (ScanTask) tasks[pos];
    total += current.filesCount();
  }
  return total;
}

@Override
Expand Down
34 changes: 33 additions & 1 deletion core/src/main/java/org/apache/iceberg/BaseCombinedScanTask.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

public class BaseCombinedScanTask implements CombinedScanTask {
private final FileScanTask[] tasks;
private transient volatile List<FileScanTask> taskList = null;

public BaseCombinedScanTask(FileScanTask... tasks) {
Preconditions.checkNotNull(tasks, "tasks cannot be null");
Expand All @@ -41,7 +42,38 @@ public BaseCombinedScanTask(List<FileScanTask> tasks) {

/**
 * Returns the file scan tasks combined in this task.
 *
 * <p>The immutable list is built from the task array on first access and cached in a transient
 * volatile field. No lock is taken: concurrent first calls may each build a list, but every
 * list is an immutable copy of the same array, so whichever one is published is equivalent.
 *
 * @return an immutable collection of the file scan tasks
 */
@Override
public Collection<FileScanTask> files() {
  // read the volatile field once; fall through to build the list only when it is unset
  List<FileScanTask> cached = taskList;
  if (cached == null) {
    cached = ImmutableList.copyOf(tasks);
    this.taskList = cached;
  }

  return cached;
}

/**
 * Sums {@code sizeBytes()} over all file scan tasks in this combined task.
 *
 * @return the combined size of the underlying tasks, 0 if there are none
 */
@Override
public long sizeBytes() {
  long total = 0L;
  for (int index = 0; index < tasks.length; index++) {
    total += tasks[index].sizeBytes();
  }
  return total;
}

/**
 * Sums {@code estimatedRowsCount()} over all file scan tasks in this combined task.
 *
 * @return the combined row estimate of the underlying tasks, 0 if there are none
 */
@Override
public long estimatedRowsCount() {
  long total = 0L;
  for (int index = 0; index < tasks.length; index++) {
    total += tasks[index].estimatedRowsCount();
  }
  return total;
}

/**
 * Sums {@code filesCount()} over all file scan tasks in this combined task.
 *
 * @return the combined file count of the underlying tasks, 0 if there are none
 */
@Override
public int filesCount() {
  int total = 0;
  for (int index = 0; index < tasks.length; index++) {
    total += tasks[index].filesCount();
  }
  return total;
}

@Override
Expand Down
65 changes: 62 additions & 3 deletions core/src/main/java/org/apache/iceberg/BaseFileScanTask.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
public class BaseFileScanTask extends BaseContentScanTask<FileScanTask, DataFile>
implements FileScanTask {
private final DeleteFile[] deletes;
private transient volatile List<DeleteFile> deleteList = null;
private transient volatile long deletesSizeBytes = 0L;

public BaseFileScanTask(
DataFile file,
Expand All @@ -45,31 +47,66 @@ protected FileScanTask self() {

/**
 * Creates a split of the given parent task.
 *
 * @param parentTask the task being split
 * @param offset start offset of the split
 * @param length length of the split
 * @return a split task that carries the delete size already computed for this task
 */
@Override
protected FileScanTask newSplitTask(FileScanTask parentTask, long offset, long length) {
  // pass the cached delete size along so each split does not have to re-sum the delete files
  return new SplitScanTask(offset, length, parentTask, deletesSizeBytes());
}

/**
 * Returns the delete files attached to this task.
 *
 * <p>The immutable list is built from the delete array on first access and cached in a
 * transient volatile field. No lock is taken: concurrent first calls may each build a list,
 * but every list is an immutable copy of the same array, so the outcome is equivalent.
 *
 * @return an immutable list of delete files, empty if there are none
 */
@Override
public List<DeleteFile> deletes() {
  // read the volatile field once; build the list only when it is unset
  List<DeleteFile> cached = deleteList;
  if (cached == null) {
    cached = ImmutableList.copyOf(deletes);
    this.deleteList = cached;
  }

  return cached;
}

/**
 * Returns {@code length()} plus the summed size of the delete files of this task.
 *
 * @return the number of bytes attributed to this task
 */
@Override
public long sizeBytes() {
  long deleteBytes = deletesSizeBytes();
  return length() + deleteBytes;
}

/**
 * Returns the number of delete files plus one.
 *
 * @return {@code deletes.length + 1}
 */
@Override
public int filesCount() {
  // the extra 1 presumably accounts for the data file of this task - confirm
  int deleteFilesCount = deletes.length;
  return deleteFilesCount + 1;
}

/**
 * Returns the schema reported by the superclass; this override only forwards the call.
 *
 * @return the result of {@code super.schema()}
 */
@Override
public Schema schema() {
return super.schema();
}

/**
 * Returns the summed {@code fileSizeInBytes()} of all delete files, computing it on demand.
 *
 * <p>The result is cached so that split tasks created from this task can reuse it. A cached
 * value of 0 doubles as the "not computed yet" marker, so the sum is only skipped when it is
 * non-zero or when there are no delete files at all.
 */
private long deletesSizeBytes() {
  boolean alreadyKnown = deletesSizeBytes != 0L || deletes.length == 0;
  if (alreadyKnown) {
    return deletesSizeBytes;
  }

  long total = 0L;
  for (int index = 0; index < deletes.length; index++) {
    total += deletes[index].fileSizeInBytes();
  }
  this.deletesSizeBytes = total;

  return deletesSizeBytes;
}

@VisibleForTesting
static final class SplitScanTask implements FileScanTask, MergeableScanTask<SplitScanTask> {
private final long len;
private final long offset;
private final FileScanTask fileScanTask;
private transient volatile long deletesSizeBytes = 0L;

/**
 * Creates a split without a precomputed delete size; the size stays at 0 and is computed on
 * demand.
 *
 * @param offset start offset of the split
 * @param len length of the split
 * @param fileScanTask the task this split belongs to
 */
SplitScanTask(long offset, long len, FileScanTask fileScanTask) {
  // 0 is the same value the field initializer assigns, i.e. "not computed yet"
  this(offset, len, fileScanTask, 0L);
}

/**
 * Creates a split that starts with a known delete size.
 *
 * @param offset start offset of the split
 * @param len length of the split
 * @param fileScanTask the task this split belongs to
 * @param deletesSizeBytes summed size of the delete files; 0 means it is computed on demand
 */
SplitScanTask(long offset, long len, FileScanTask fileScanTask, long deletesSizeBytes) {
  this.fileScanTask = fileScanTask;
  this.deletesSizeBytes = deletesSizeBytes;
  this.len = len;
  this.offset = offset;
}

@Override
public DataFile file() {
return fileScanTask.file();
Expand Down Expand Up @@ -105,6 +142,16 @@ public long estimatedRowsCount() {
return BaseContentScanTask.estimateRowsCount(len, fileScanTask.file());
}

/**
 * Returns the split length plus the summed size of the delete files of the underlying task.
 *
 * @return the number of bytes attributed to this split
 */
@Override
public long sizeBytes() {
  long deleteBytes = deletesSizeBytes();
  return len + deleteBytes;
}

/**
 * Returns the file count of the underlying file scan task.
 *
 * @return the result of {@code fileScanTask.filesCount()}
 */
@Override
public int filesCount() {
  int parentFilesCount = fileScanTask.filesCount();
  return parentFilesCount;
}

@Override
public Expression residual() {
return fileScanTask.residual();
Expand All @@ -128,7 +175,19 @@ public boolean canMerge(ScanTask other) {
/**
 * Merges this split with another split into one split that starts at this split's offset and
 * covers both lengths.
 *
 * @param other the task to merge with; cast to {@code SplitScanTask} without a check here
 * @return a new split over the same underlying file scan task
 */
@Override
public SplitScanTask merge(ScanTask other) {
  SplitScanTask that = (SplitScanTask) other;
  // carry over whatever delete size is cached so far; if it is still 0 the merged split
  // computes it on demand
  return new SplitScanTask(offset, len + that.length(), fileScanTask, deletesSizeBytes);
}

/**
 * Returns the summed {@code fileSizeInBytes()} of the delete files of the underlying task,
 * computing it on demand.
 *
 * <p>A cached value of 0 doubles as the "not computed yet" marker. The sum is skipped when a
 * non-zero value is cached or when the underlying task reports at most one file.
 */
private long deletesSizeBytes() {
  boolean alreadyKnown = deletesSizeBytes != 0L || fileScanTask.filesCount() <= 1;
  if (alreadyKnown) {
    return deletesSizeBytes;
  }

  long total = 0L;
  for (DeleteFile delete : fileScanTask.deletes()) {
    total += delete.fileSizeInBytes();
  }
  this.deletesSizeBytes = total;

  return deletesSizeBytes;
}
}
}

0 comments on commit 181d3e2

Please sign in to comment.