Commit a3957ee

Merge branch 'main' into rewrite-data-where-sql

ludlows authored Nov 30, 2024
2 parents ff18bf6 + e770fac

Showing 554 changed files with 16,738 additions and 5,285 deletions.

2 changes: 2 additions & 0 deletions .asf.yaml

@@ -39,6 +39,8 @@ github:
         required_approving_review_count: 1
 
       required_linear_history: true
 
+  del_branch_on_merge: true
+
   features:
     wiki: true

3 changes: 2 additions & 1 deletion .github/ISSUE_TEMPLATE/iceberg_bug_report.yml

@@ -28,7 +28,8 @@ body:
       description: What Apache Iceberg version are you using?
       multiple: false
       options:
-        - "1.6.1 (latest release)"
+        - "1.7.0 (latest release)"
+        - "1.6.1"
         - "1.6.0"
         - "1.5.2"
         - "1.5.1"

6 changes: 0 additions & 6 deletions .github/labeler.yml

@@ -130,12 +130,6 @@ MR:
       'mr/**/*'
     ]
 
-PIG:
-  - changed-files:
-    - any-glob-to-any-file: [
-      'pig/**/*'
-    ]
-
 AWS:
   - changed-files:
     - any-glob-to-any-file: [

1 change: 0 additions & 1 deletion .github/workflows/delta-conversion-ci.yml

@@ -53,7 +53,6 @@ on:
       - 'hive-runtime/**'
       - 'flink/**'
       - 'kafka-connect/**'
-      - 'pig/**'
       - 'docs/**'
       - 'site/**'
       - 'open-api/**'

1 change: 0 additions & 1 deletion .github/workflows/flink-ci.yml

@@ -53,7 +53,6 @@ on:
       - 'hive-runtime/**'
      - 'kafka-connect/**'
       - 'spark/**'
-      - 'pig/**'
       - 'docs/**'
       - 'site/**'
       - 'open-api/**'

1 change: 0 additions & 1 deletion .github/workflows/hive-ci.yml

@@ -51,7 +51,6 @@ on:
       - 'spark/**'
       - 'flink/**'
       - 'kafka-connect/**'
-      - 'pig/**'
       - 'docs/**'
       - 'site/**'
       - 'open-api/**'

1 change: 0 additions & 1 deletion .github/workflows/kafka-connect-ci.yml

@@ -53,7 +53,6 @@ on:
       - 'hive3-orc-bundle/**'
       - 'hive-runtime/**'
       - 'spark/**'
-      - 'pig/**'
       - 'docs/**'
       - 'site/**'
       - 'open-api/**'

55 changes: 55 additions & 0 deletions .github/workflows/publish-docker.yml

@@ -0,0 +1,55 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+name: Build and Push Docker Image
+
+on:
+  push:
+    tags:
+      - 'apache-iceberg-[0-9]+.[0-9]+.[0-9]+'
+  workflow_dispatch:
+
+env:
+  DOCKER_IMAGE_TAG: iceberg-rest-fixture
+  DOCKER_IMAGE_VERSION: latest
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-java@v4
+        with:
+          distribution: zulu
+          java-version: 21
+      - name: Build Iceberg Open API project
+        run: ./gradlew :iceberg-open-api:shadowJar
+      - name: Login to Docker Hub
+        run: |
+          docker login -u ${{ secrets.DOCKERHUB_USER }} -p ${{ secrets.DOCKERHUB_TOKEN }}
+      - name: Set the tagged version
+        # for tag 'apache-iceberg-1.7.0', publish image 'apache/iceberg-rest-fixture:1.7.0'
+        if: github.event_name == 'push' && contains(github.ref, 'refs/tags/')
+        run: |
+          echo "DOCKER_IMAGE_VERSION=`echo ${{ github.ref }} | tr -d -c 0-9.`" >> "$GITHUB_ENV"
+      - name: Build Docker Image
+        run: docker build -t ${{ secrets.DOCKERHUB_USER }}/$DOCKER_IMAGE_TAG:$DOCKER_IMAGE_VERSION -f docker/iceberg-rest-adapter-image/Dockerfile .
+      - name: Push Docker Image
+        run: |
+          docker push ${{ secrets.DOCKERHUB_USER }}/$DOCKER_IMAGE_TAG:$DOCKER_IMAGE_VERSION

1 change: 0 additions & 1 deletion .github/workflows/spark-ci.yml

@@ -54,7 +54,6 @@ on:
       - 'hive-runtime/**'
       - 'flink/**'
       - 'kafka-connect/**'
-      - 'pig/**'
       - 'docs/**'
       - 'open-api/**'
       - 'format/**'

1 change: 0 additions & 1 deletion README.md

@@ -74,7 +74,6 @@ Iceberg also has modules for adding Iceberg support to processing engines:
 * `iceberg-spark` is an implementation of Spark's Datasource V2 API for Iceberg with submodules for each Spark version (use runtime jars for a shaded version)
 * `iceberg-flink` contains classes for integrating with Apache Flink (use iceberg-flink-runtime for a shaded version)
 * `iceberg-mr` contains an InputFormat and other classes for integrating with Apache Hive
-* `iceberg-pig` is an implementation of Pig's LoadFunc API for Iceberg
 
 ---
 **NOTE**

3 changes: 2 additions & 1 deletion api/src/main/java/org/apache/iceberg/AddedRowsScanTask.java

@@ -19,6 +19,7 @@
 package org.apache.iceberg;
 
 import java.util.List;
+import org.apache.iceberg.util.ScanTaskUtil;
 
 /**
  * A scan task for inserts generated by adding a data file to the table.
@@ -55,7 +56,7 @@ default ChangelogOperation operation() {
 
   @Override
   default long sizeBytes() {
-    return length() + deletes().stream().mapToLong(ContentFile::fileSizeInBytes).sum();
+    return length() + ScanTaskUtil.contentSizeInBytes(deletes());
   }
 
   @Override

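This hunk and the three sibling scan-task hunks below route delete-file sizing through a shared helper instead of summing `fileSizeInBytes()` directly. `ScanTaskUtil` itself is not shown in this commit; the following is a minimal sketch of the behavior implied by the `DeleteFile` changes further down, assuming deletion vectors (Puffin-format delete files) report the size of their blob rather than the whole file:

```java
// Sketch only: the real org.apache.iceberg.util.ScanTaskUtil is not part of this excerpt.
package org.apache.iceberg.util;

import org.apache.iceberg.ContentFile;
import org.apache.iceberg.DeleteFile;
import org.apache.iceberg.FileContent;
import org.apache.iceberg.FileFormat;

public class ScanTaskUtil {

  private ScanTaskUtil() {}

  // Size of a single content file: deletion vectors report the size of their
  // blob inside the Puffin file; all other files report the full file size.
  public static long contentSizeInBytes(ContentFile<?> file) {
    if (file.content() == FileContent.POSITION_DELETES) {
      DeleteFile deleteFile = (DeleteFile) file;
      return isDV(deleteFile) ? deleteFile.contentSizeInBytes() : deleteFile.fileSizeInBytes();
    }
    return file.fileSizeInBytes();
  }

  // Sum over a collection of delete files, as used by the sizeBytes() implementations.
  public static long contentSizeInBytes(Iterable<? extends ContentFile<?>> files) {
    long size = 0L;
    for (ContentFile<?> file : files) {
      size += contentSizeInBytes(file);
    }
    return size;
  }

  // Assumption: a deletion vector is a position-delete file stored in Puffin format.
  private static boolean isDV(DeleteFile deleteFile) {
    return deleteFile.format() == FileFormat.PUFFIN;
  }
}
```
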
15 changes: 13 additions & 2 deletions api/src/main/java/org/apache/iceberg/DataFile.java

@@ -104,12 +104,21 @@ public interface DataFile extends ContentFile<DataFile> {
           "referenced_data_file",
           StringType.get(),
           "Fully qualified location (URI with FS scheme) of a data file that all deletes reference");
+  Types.NestedField CONTENT_OFFSET =
+      optional(
+          144, "content_offset", LongType.get(), "The offset in the file where the content starts");
+  Types.NestedField CONTENT_SIZE =
+      optional(
+          145,
+          "content_size_in_bytes",
+          LongType.get(),
+          "The length of referenced content stored in the file");
 
   int PARTITION_ID = 102;
   String PARTITION_NAME = "partition";
   String PARTITION_DOC = "Partition data tuple, schema based on the partition spec";
 
-  // NEXT ID TO ASSIGN: 144
+  // NEXT ID TO ASSIGN: 146
 
   static StructType getType(StructType partitionType) {
     // IDs start at 100 to leave room for changes to ManifestEntry
@@ -131,7 +140,9 @@ static StructType getType(StructType partitionType) {
         SPLIT_OFFSETS,
         EQUALITY_IDS,
         SORT_ORDER_ID,
-        REFERENCED_DATA_FILE);
+        REFERENCED_DATA_FILE,
+        CONTENT_OFFSET,
+        CONTENT_SIZE);
   }
 
   /**

22 changes: 22 additions & 0 deletions api/src/main/java/org/apache/iceberg/DeleteFile.java

@@ -42,4 +42,26 @@ default List<Long> splitOffsets() {
   default String referencedDataFile() {
     return null;
   }
+
+  /**
+   * Returns the offset in the file where the content starts.
+   *
+   * <p>The content offset is required for deletion vectors and points to the start of the deletion
+   * vector blob in the Puffin file, enabling direct access. This method always returns null for
+   * equality and position delete files.
+   */
+  default Long contentOffset() {
+    return null;
+  }
+
+  /**
+   * Returns the length of referenced content stored in the file.
+   *
+   * <p>The content size is required for deletion vectors and indicates the size of the deletion
+   * vector blob in the Puffin file, enabling direct access. This method always returns null for
+   * equality and position delete files.
+   */
+  default Long contentSizeInBytes() {
+    return null;
+  }
 }
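
Together with the new `CONTENT_OFFSET`/`CONTENT_SIZE` manifest fields above, these getters let a reader seek straight to a deletion vector's blob without first parsing the Puffin footer. A hedged sketch of that access pattern (the surrounding reader class and method names are illustrative, not part of this commit):

```java
import java.io.DataInputStream;
import java.io.IOException;
import org.apache.iceberg.DeleteFile;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.io.SeekableInputStream;

class DeletionVectorReader {
  // Reads the raw deletion vector blob using the offsets carried in the manifest,
  // skipping a Puffin footer round-trip. Assumes `file` is a deletion vector
  // (PUFFIN format), so contentOffset() and contentSizeInBytes() are non-null.
  static byte[] readBlob(FileIO io, DeleteFile file) throws IOException {
    InputFile input = io.newInputFile(file.location());
    try (SeekableInputStream in = input.newStream()) {
      in.seek(file.contentOffset()); // start of the blob in the Puffin file
      byte[] blob = new byte[file.contentSizeInBytes().intValue()];
      new DataInputStream(in).readFully(blob); // length from content_size_in_bytes
      return blob;
    }
  }
}
```
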
2 changes: 1 addition & 1 deletion api/src/main/java/org/apache/iceberg/DeleteFiles.java

@@ -51,7 +51,7 @@ public interface DeleteFiles extends SnapshotUpdate<DeleteFiles> {
    * @return this for method chaining
    */
   default DeleteFiles deleteFile(DataFile file) {
-    deleteFile(file.path());
+    deleteFile(file.location());
     return this;
   }
 
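This appears to be part of a broader migration from the `CharSequence`-returning `ContentFile.path()` to `ContentFile.location()`, which also shows up in the encryption hunk further down. A small sketch of the caller-side effect (the `drop` helper and its arguments are assumptions for illustration):

```java
import org.apache.iceberg.DataFile;
import org.apache.iceberg.Table;

class DropOneFile {
  // Remove a single data file from the table. location() returns String,
  // so call sites no longer need the .toString() that path() required.
  static void drop(Table table, DataFile dataFile) {
    table.newDelete()
        .deleteFile(dataFile.location())
        .commit();
  }
}
```
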
3 changes: 2 additions & 1 deletion api/src/main/java/org/apache/iceberg/DeletedDataFileScanTask.java

@@ -19,6 +19,7 @@
 package org.apache.iceberg;
 
 import java.util.List;
+import org.apache.iceberg.util.ScanTaskUtil;
 
 /**
  * A scan task for deletes generated by removing a data file from the table.
@@ -54,7 +55,7 @@ default ChangelogOperation operation() {
 
   @Override
   default long sizeBytes() {
-    return length() + existingDeletes().stream().mapToLong(ContentFile::fileSizeInBytes).sum();
+    return length() + ScanTaskUtil.contentSizeInBytes(existingDeletes());
   }
 
   @Override

7 changes: 4 additions & 3 deletions api/src/main/java/org/apache/iceberg/DeletedRowsScanTask.java

@@ -19,6 +19,7 @@
 package org.apache.iceberg;
 
 import java.util.List;
+import org.apache.iceberg.util.ScanTaskUtil;
 
 /**
  * A scan task for deletes generated by adding delete files to the table.
@@ -63,9 +64,9 @@ default ChangelogOperation operation() {
 
   @Override
   default long sizeBytes() {
-    return length()
-        + addedDeletes().stream().mapToLong(ContentFile::fileSizeInBytes).sum()
-        + existingDeletes().stream().mapToLong(ContentFile::fileSizeInBytes).sum();
+    long addedDeletesSize = ScanTaskUtil.contentSizeInBytes(addedDeletes());
+    long existingDeletesSize = ScanTaskUtil.contentSizeInBytes(existingDeletes());
+    return length() + addedDeletesSize + existingDeletesSize;
   }
 
   @Override

1 change: 1 addition & 0 deletions api/src/main/java/org/apache/iceberg/FileFormat.java

@@ -24,6 +24,7 @@
 
 /** Enum of supported file formats. */
 public enum FileFormat {
+  PUFFIN("puffin", false),
   ORC("orc", true),
   PARQUET("parquet", true),
   AVRO("avro", true),

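The second constructor argument is the splittable flag, so Puffin files are registered as non-splittable. A short usage sketch of the enum's existing helpers with the new constant (file names chosen for illustration):

```java
import org.apache.iceberg.FileFormat;

class PuffinNaming {
  public static void main(String[] args) {
    // Derive a Puffin file name and map it back to the enum constant.
    String fileName = FileFormat.PUFFIN.addExtension("delete-vector-0001");
    System.out.println(fileName);                          // delete-vector-0001.puffin
    System.out.println(FileFormat.fromFileName(fileName)); // PUFFIN
    System.out.println(FileFormat.PUFFIN.isSplittable());  // false
  }
}
```
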
3 changes: 2 additions & 1 deletion api/src/main/java/org/apache/iceberg/FileScanTask.java

@@ -19,6 +19,7 @@
 package org.apache.iceberg;
 
 import java.util.List;
+import org.apache.iceberg.util.ScanTaskUtil;
 
 /** A scan task over a range of bytes in a single data file. */
 public interface FileScanTask extends ContentScanTask<DataFile>, SplittableScanTask<FileScanTask> {
@@ -36,7 +37,7 @@ default Schema schema() {
 
   @Override
   default long sizeBytes() {
-    return length() + deletes().stream().mapToLong(ContentFile::fileSizeInBytes).sum();
+    return length() + ScanTaskUtil.contentSizeInBytes(deletes());
   }
 
   @Override

2 changes: 1 addition & 1 deletion api/src/main/java/org/apache/iceberg/ManifestFile.java

@@ -49,7 +49,7 @@ public interface ManifestFile {
           Types.LongType.get(),
           "Lowest sequence number in the manifest");
   Types.NestedField SNAPSHOT_ID =
-      optional(
+      required(
           503, "added_snapshot_id", Types.LongType.get(), "Snapshot ID that added the manifest");
   Types.NestedField ADDED_FILES_COUNT =
       optional(504, "added_files_count", Types.IntegerType.get(), "Added entry count");

10 changes: 7 additions & 3 deletions api/src/main/java/org/apache/iceberg/Schema.java

@@ -28,6 +28,7 @@
 import java.util.Map;
 import java.util.Set;
 import java.util.stream.Collectors;
+import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
 import org.apache.iceberg.relocated.com.google.common.base.Joiner;
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
 import org.apache.iceberg.relocated.com.google.common.collect.BiMap;
@@ -54,9 +55,12 @@ public class Schema implements Serializable {
   private static final Joiner NEWLINE = Joiner.on('\n');
   private static final String ALL_COLUMNS = "*";
   private static final int DEFAULT_SCHEMA_ID = 0;
-  private static final int DEFAULT_VALUES_MIN_FORMAT_VERSION = 3;
-  private static final Map<Type.TypeID, Integer> MIN_FORMAT_VERSIONS =
-      ImmutableMap.of(Type.TypeID.TIMESTAMP_NANO, 3);
+
+  @VisibleForTesting static final int DEFAULT_VALUES_MIN_FORMAT_VERSION = 3;
+
+  @VisibleForTesting
+  static final Map<Type.TypeID, Integer> MIN_FORMAT_VERSIONS =
+      ImmutableMap.of(Type.TypeID.TIMESTAMP_NANO, 3, Type.TypeID.VARIANT, 3);
 
   private final StructType struct;
   private final int schemaId;

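The widened `MIN_FORMAT_VERSIONS` map now registers the variant type as requiring table format version 3, alongside nanosecond timestamps. A hedged illustration of the check this map backs, assuming the existing `Schema.checkCompatibility(Schema, int)` entry point behaves as its name suggests:

```java
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

import static org.apache.iceberg.types.Types.NestedField.required;

class FormatVersionGate {
  public static void main(String[] args) {
    Schema schema =
        new Schema(
            required(1, "id", Types.LongType.get()),
            required(2, "ts", Types.TimestampNanoType.withZone()));

    // Fine for a v3 table.
    Schema.checkCompatibility(schema, 3);

    // Expected to throw for v2, since timestamptz_ns (and, with this change,
    // variant) are only supported from format version 3 onward.
    Schema.checkCompatibility(schema, 2);
  }
}
```
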
4 changes: 3 additions & 1 deletion api/src/main/java/org/apache/iceberg/UpdateSchema.java

@@ -369,7 +369,9 @@ default UpdateSchema updateColumn(String name, Type.PrimitiveType newType, String newDoc) {
    * to create a union schema.
    *
    * <p>For fields with the same canonical names in both schemas, it is required that the widened
-   * type is supported using {@link UpdateSchema#updateColumn(String, Type.PrimitiveType)}
+   * type is supported using {@link UpdateSchema#updateColumn(String, Type.PrimitiveType)}.
+   * Differences in type are ignored if the new type is narrower than the existing type (e.g. long
+   * to int, double to float).
    *
    * <p>Only supports turning a previously required field into an optional one if it is marked
    * optional in the provided new schema using {@link UpdateSchema#makeColumnOptional(String)}

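The clarified javadoc belongs to `UpdateSchema#unionByNameWith(Schema)`. A short usage sketch (the `evolve` helper, the table, and the incoming schema are assumptions for illustration):

```java
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.types.Types;

import static org.apache.iceberg.types.Types.NestedField.optional;
import static org.apache.iceberg.types.Types.NestedField.required;

class UnionSchemas {
  // Merge an incoming schema into the table schema by field name.
  static void evolve(Table table) {
    // Suppose "id" is int in the table: long here is a widening and is applied;
    // a narrower incoming type would simply be ignored per the clarified javadoc.
    Schema incoming =
        new Schema(
            required(1, "id", Types.LongType.get()),
            optional(2, "email", Types.StringType.get())); // new column, added as optional

    table.updateSchema().unionByNameWith(incoming).commit();
  }
}
```
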
5 changes: 2 additions & 3 deletions core/src/main/java/org/apache/iceberg/encryption/EncryptingFileIO.java

@@ -93,10 +93,9 @@ public InputFile newInputFile(DeleteFile file) {
 
   private InputFile newInputFile(ContentFile<?> file) {
     if (file.keyMetadata() != null) {
-      return newDecryptingInputFile(
-          file.path().toString(), file.fileSizeInBytes(), file.keyMetadata());
+      return newDecryptingInputFile(file.location(), file.fileSizeInBytes(), file.keyMetadata());
     } else {
-      return newInputFile(file.path().toString(), file.fileSizeInBytes());
+      return newInputFile(file.location(), file.fileSizeInBytes());
     }
   }
 
@@ -148,7 +147,7 @@ public void close() {
   }
 
   private SimpleEncryptedInputFile wrap(ContentFile<?> file) {
-    InputFile encryptedInputFile = io.newInputFile(file.path().toString(), file.fileSizeInBytes());
+    InputFile encryptedInputFile = io.newInputFile(file.location(), file.fileSizeInBytes());
     return new SimpleEncryptedInputFile(encryptedInputFile, toKeyMetadata(file.keyMetadata()));
   }
 

11 changes: 6 additions & 5 deletions api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java

@@ -337,9 +337,9 @@ public <T> Expression predicate(UnboundPredicate<T> pred) {
             pred.op(), pred.term(), (T) sanitize(pred.literal(), now, today));
       case IN:
       case NOT_IN:
-        Iterable<String> iter =
-            () -> pred.literals().stream().map(lit -> sanitize(lit, now, today)).iterator();
-        return new UnboundPredicate<>(pred.op(), pred.term(), (Iterable<T>) iter);
+        Iterable<T> iter =
+            () -> pred.literals().stream().map(lit -> (T) sanitize(lit, now, today)).iterator();
+        return new UnboundPredicate<>(pred.op(), pred.term(), iter);
       default:
         throw new UnsupportedOperationException(
             "Cannot sanitize unsupported predicate type: " + pred.op());
@@ -534,7 +534,8 @@ private static String sanitize(Type type, Object value, long now, int today) {
       case DECIMAL:
       case FIXED:
       case BINARY:
-        // for boolean, uuid, decimal, fixed, and binary, match the string result
+      case VARIANT:
+        // for boolean, uuid, decimal, fixed, variant, and binary, match the string result
         return sanitizeSimpleString(value.toString());
     }
     throw new UnsupportedOperationException(
@@ -562,7 +563,7 @@ private static String sanitize(Literal<?> literal, long now, int today) {
     } else if (literal instanceof Literals.DoubleLiteral) {
      return sanitizeNumber(((Literals.DoubleLiteral) literal).value(), "float");
     } else {
-      // for uuid, decimal, fixed, and binary, match the string result
+      // for uuid, decimal, fixed, variant, and binary, match the string result
       return sanitizeSimpleString(literal.value().toString());
     }
   }

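With `VARIANT` handled, sanitization masks variant literals the same way as other opaque values, and the `IN`/`NOT IN` branch now keeps the predicate's literal type parameter intact instead of forcing `Iterable<String>`. A hedged sketch of the public entry point involved (the printed output shape is illustrative, not verbatim):

```java
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.ExpressionUtil;
import org.apache.iceberg.expressions.Expressions;

class SanitizeDemo {
  public static void main(String[] args) {
    Expression in = Expressions.in("category", "books", "games");

    // Literal values are replaced with placeholders so filters can be logged
    // safely, producing something like: category IN ((hash-1), (hash-2))
    System.out.println(ExpressionUtil.toSanitizedString(in));
  }
}
```
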