Commit a3957ee

Merge branch 'main' into rewrite-data-where-sql

ludlows authored Nov 30, 2024
2 parents ff18bf6 + e770fac

Showing 554 changed files with 16,738 additions and 5,285 deletions.

2 changes: 2 additions & 0 deletions .asf.yaml

@@ -39,6 +39,8 @@ github:
         required_approving_review_count: 1
 
       required_linear_history: true
 
+  del_branch_on_merge: true
+
   features:
     wiki: true

3 changes: 2 additions & 1 deletion .github/ISSUE_TEMPLATE/iceberg_bug_report.yml

@@ -28,7 +28,8 @@ body:
       description: What Apache Iceberg version are you using?
       multiple: false
       options:
-        - "1.6.1 (latest release)"
+        - "1.7.0 (latest release)"
+        - "1.6.1"
         - "1.6.0"
         - "1.5.2"
         - "1.5.1"

6 changes: 0 additions & 6 deletions .github/labeler.yml

@@ -130,12 +130,6 @@ MR:
       'mr/**/*'
     ]
 
-PIG:
-  - changed-files:
-    - any-glob-to-any-file: [
-      'pig/**/*'
-    ]
-
 AWS:
   - changed-files:
     - any-glob-to-any-file: [

1 change: 0 additions & 1 deletion .github/workflows/delta-conversion-ci.yml

@@ -53,7 +53,6 @@ on:
       - 'hive-runtime/**'
       - 'flink/**'
       - 'kafka-connect/**'
-      - 'pig/**'
       - 'docs/**'
       - 'site/**'
       - 'open-api/**'

1 change: 0 additions & 1 deletion .github/workflows/flink-ci.yml

@@ -53,7 +53,6 @@ on:
       - 'hive-runtime/**'
      - 'kafka-connect/**'
       - 'spark/**'
-      - 'pig/**'
       - 'docs/**'
       - 'site/**'
       - 'open-api/**'

1 change: 0 additions & 1 deletion .github/workflows/hive-ci.yml

@@ -51,7 +51,6 @@ on:
       - 'spark/**'
       - 'flink/**'
       - 'kafka-connect/**'
-      - 'pig/**'
       - 'docs/**'
       - 'site/**'
       - 'open-api/**'

1 change: 0 additions & 1 deletion .github/workflows/kafka-connect-ci.yml

@@ -53,7 +53,6 @@ on:
       - 'hive3-orc-bundle/**'
       - 'hive-runtime/**'
       - 'spark/**'
-      - 'pig/**'
       - 'docs/**'
       - 'site/**'
       - 'open-api/**'

55 changes: 55 additions & 0 deletions .github/workflows/publish-docker.yml

@@ -0,0 +1,55 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+name: Build and Push Docker Image
+
+on:
+  push:
+    tags:
+      - 'apache-iceberg-[0-9]+.[0-9]+.[0-9]+'
+  workflow_dispatch:
+
+env:
+  DOCKER_IMAGE_TAG: iceberg-rest-fixture
+  DOCKER_IMAGE_VERSION: latest
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-java@v4
+        with:
+          distribution: zulu
+          java-version: 21
+      - name: Build Iceberg Open API project
+        run: ./gradlew :iceberg-open-api:shadowJar
+      - name: Login to Docker Hub
+        run: |
+          docker login -u ${{ secrets.DOCKERHUB_USER }} -p ${{ secrets.DOCKERHUB_TOKEN }}
+      - name: Set the tagged version
+        # for tag 'apache-iceberg-1.7.0', publish image 'apache/iceberg-rest-fixture:1.7.0'
+        if: github.event_name == 'push' && contains(github.ref, 'refs/tags/')
+        run: |
+          echo "DOCKER_IMAGE_VERSION=`echo ${{ github.ref }} | tr -d -c 0-9.`" >> "$GITHUB_ENV"
+      - name: Build Docker Image
+        run: docker build -t ${{ secrets.DOCKERHUB_USER }}/$DOCKER_IMAGE_TAG:$DOCKER_IMAGE_VERSION -f docker/iceberg-rest-adapter-image/Dockerfile .
+      - name: Push Docker Image
+        run: |
+          docker push ${{ secrets.DOCKERHUB_USER }}/$DOCKER_IMAGE_TAG:$DOCKER_IMAGE_VERSION

1 change: 0 additions & 1 deletion .github/workflows/spark-ci.yml

@@ -54,7 +54,6 @@ on:
       - 'hive-runtime/**'
       - 'flink/**'
       - 'kafka-connect/**'
-      - 'pig/**'
       - 'docs/**'
       - 'open-api/**'
       - 'format/**'

1 change: 0 additions & 1 deletion README.md

@@ -74,7 +74,6 @@ Iceberg also has modules for adding Iceberg support to processing engines:
 * `iceberg-spark` is an implementation of Spark's Datasource V2 API for Iceberg with submodules for each Spark version (use runtime jars for a shaded version)
 * `iceberg-flink` contains classes for integrating with Apache Flink (use iceberg-flink-runtime for a shaded version)
 * `iceberg-mr` contains an InputFormat and other classes for integrating with Apache Hive
-* `iceberg-pig` is an implementation of Pig's LoadFunc API for Iceberg
 
 ---
 **NOTE**

3 changes: 2 additions & 1 deletion api/src/main/java/org/apache/iceberg/AddedRowsScanTask.java

@@ -19,6 +19,7 @@
 package org.apache.iceberg;
 
 import java.util.List;
+import org.apache.iceberg.util.ScanTaskUtil;
 
 /**
  * A scan task for inserts generated by adding a data file to the table.
@@ -55,7 +56,7 @@ default ChangelogOperation operation() {
 
   @Override
   default long sizeBytes() {
-    return length() + deletes().stream().mapToLong(ContentFile::fileSizeInBytes).sum();
+    return length() + ScanTaskUtil.contentSizeInBytes(deletes());
   }
 
   @Override

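This hunk and the three sibling scan-task hunks below route delete-file sizing through a shared helper instead of summing `fileSizeInBytes()` directly. `ScanTaskUtil` itself is not shown in this commit; the following is a minimal sketch of the behavior implied by the `DeleteFile` changes further down, assuming deletion vectors (Puffin-format delete files) report the size of their blob rather than the whole file:

```java
// Sketch only: the real org.apache.iceberg.util.ScanTaskUtil is not part of this excerpt.
package org.apache.iceberg.util;

import org.apache.iceberg.ContentFile;
import org.apache.iceberg.DeleteFile;
import org.apache.iceberg.FileContent;
import org.apache.iceberg.FileFormat;

public class ScanTaskUtil {

  private ScanTaskUtil() {}

  // Size of a single content file: deletion vectors report the size of their
  // blob inside the Puffin file; all other files report the full file size.
  public static long contentSizeInBytes(ContentFile<?> file) {
    if (file.content() == FileContent.POSITION_DELETES) {
      DeleteFile deleteFile = (DeleteFile) file;
      return isDV(deleteFile) ? deleteFile.contentSizeInBytes() : deleteFile.fileSizeInBytes();
    }
    return file.fileSizeInBytes();
  }

  // Sum over a collection of delete files, as used by the sizeBytes() implementations.
  public static long contentSizeInBytes(Iterable<? extends ContentFile<?>> files) {
    long size = 0L;
    for (ContentFile<?> file : files) {
      size += contentSizeInBytes(file);
    }
    return size;
  }

  // Assumption: a deletion vector is a position-delete file stored in Puffin format.
  private static boolean isDV(DeleteFile deleteFile) {
    return deleteFile.format() == FileFormat.PUFFIN;
  }
}
```
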
15 changes: 13 additions & 2 deletions api/src/main/java/org/apache/iceberg/DataFile.java

@@ -104,12 +104,21 @@ public interface DataFile extends ContentFile<DataFile> {
           "referenced_data_file",
           StringType.get(),
           "Fully qualified location (URI with FS scheme) of a data file that all deletes reference");
+  Types.NestedField CONTENT_OFFSET =
+      optional(
+          144, "content_offset", LongType.get(), "The offset in the file where the content starts");
+  Types.NestedField CONTENT_SIZE =
+      optional(
+          145,
+          "content_size_in_bytes",
+          LongType.get(),
+          "The length of referenced content stored in the file");
 
   int PARTITION_ID = 102;
   String PARTITION_NAME = "partition";
   String PARTITION_DOC = "Partition data tuple, schema based on the partition spec";
 
-  // NEXT ID TO ASSIGN: 144
+  // NEXT ID TO ASSIGN: 146
 
   static StructType getType(StructType partitionType) {
     // IDs start at 100 to leave room for changes to ManifestEntry
@@ -131,7 +140,9 @@ static StructType getType(StructType partitionType) {
         SPLIT_OFFSETS,
         EQUALITY_IDS,
         SORT_ORDER_ID,
-        REFERENCED_DATA_FILE);
+        REFERENCED_DATA_FILE,
+        CONTENT_OFFSET,
+        CONTENT_SIZE);
   }
 
   /**

22 changes: 22 additions & 0 deletions api/src/main/java/org/apache/iceberg/DeleteFile.java

@@ -42,4 +42,26 @@ default List<Long> splitOffsets() {
   default String referencedDataFile() {
     return null;
   }
+
+  /**
+   * Returns the offset in the file where the content starts.
+   *
+   * <p>The content offset is required for deletion vectors and points to the start of the deletion
+   * vector blob in the Puffin file, enabling direct access. This method always returns null for
+   * equality and position delete files.
+   */
+  default Long contentOffset() {
+    return null;
+  }
+
+  /**
+   * Returns the length of referenced content stored in the file.
+   *
+   * <p>The content size is required for deletion vectors and indicates the size of the deletion
+   * vector blob in the Puffin file, enabling direct access. This method always returns null for
+   * equality and position delete files.
+   */
+  default Long contentSizeInBytes() {
+    return null;
+  }
 }
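
Together with the new `CONTENT_OFFSET`/`CONTENT_SIZE` manifest fields above, these getters let a reader seek straight to a deletion vector's blob without first parsing the Puffin footer. A hedged sketch of that access pattern (the surrounding reader class and method names are illustrative, not part of this commit):

```java
import java.io.DataInputStream;
import java.io.IOException;
import org.apache.iceberg.DeleteFile;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.io.SeekableInputStream;

class DeletionVectorReader {
  // Reads the raw deletion vector blob using the offsets carried in the manifest,
  // skipping a Puffin footer round-trip. Assumes `file` is a deletion vector
  // (PUFFIN format), so contentOffset() and contentSizeInBytes() are non-null.
  static byte[] readBlob(FileIO io, DeleteFile file) throws IOException {
    InputFile input = io.newInputFile(file.location());
    try (SeekableInputStream in = input.newStream()) {
      in.seek(file.contentOffset()); // start of the blob in the Puffin file
      byte[] blob = new byte[file.contentSizeInBytes().intValue()];
      new DataInputStream(in).readFully(blob); // length from content_size_in_bytes
      return blob;
    }
  }
}
```
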
2 changes: 1 addition & 1 deletion api/src/main/java/org/apache/iceberg/DeleteFiles.java

@@ -51,7 +51,7 @@ public interface DeleteFiles extends SnapshotUpdate<DeleteFiles> {
    * @return this for method chaining
    */
   default DeleteFiles deleteFile(DataFile file) {
-    deleteFile(file.path());
+    deleteFile(file.location());
     return this;
   }
 
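This appears to be part of a broader migration from the `CharSequence`-returning `ContentFile.path()` to `ContentFile.location()`, which also shows up in the encryption hunk further down. A small sketch of the caller-side effect (the `drop` helper and its arguments are assumptions for illustration):

```java
import org.apache.iceberg.DataFile;
import org.apache.iceberg.Table;

class DropOneFile {
  // Remove a single data file from the table. location() returns String,
  // so call sites no longer need the .toString() that path() required.
  static void drop(Table table, DataFile dataFile) {
    table.newDelete()
        .deleteFile(dataFile.location())
        .commit();
  }
}
```
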
3 changes: 2 additions & 1 deletion api/src/main/java/org/apache/iceberg/DeletedDataFileScanTask.java

@@ -19,6 +19,7 @@
 package org.apache.iceberg;
 
 import java.util.List;
+import org.apache.iceberg.util.ScanTaskUtil;
 
 /**
  * A scan task for deletes generated by removing a data file from the table.
@@ -54,7 +55,7 @@ default ChangelogOperation operation() {
 
   @Override
   default long sizeBytes() {
-    return length() + existingDeletes().stream().mapToLong(ContentFile::fileSizeInBytes).sum();
+    return length() + ScanTaskUtil.contentSizeInBytes(existingDeletes());
   }
 
   @Override

7 changes: 4 additions & 3 deletions api/src/main/java/org/apache/iceberg/DeletedRowsScanTask.java

@@ -19,6 +19,7 @@
 package org.apache.iceberg;
 
 import java.util.List;
+import org.apache.iceberg.util.ScanTaskUtil;
 
 /**
  * A scan task for deletes generated by adding delete files to the table.
@@ -63,9 +64,9 @@ default ChangelogOperation operation() {
 
   @Override
   default long sizeBytes() {
-    return length()
-        + addedDeletes().stream().mapToLong(ContentFile::fileSizeInBytes).sum()
-        + existingDeletes().stream().mapToLong(ContentFile::fileSizeInBytes).sum();
+    long addedDeletesSize = ScanTaskUtil.contentSizeInBytes(addedDeletes());
+    long existingDeletesSize = ScanTaskUtil.contentSizeInBytes(existingDeletes());
+    return length() + addedDeletesSize + existingDeletesSize;
   }
 
   @Override

1 change: 1 addition & 0 deletions api/src/main/java/org/apache/iceberg/FileFormat.java

@@ -24,6 +24,7 @@
 
 /** Enum of supported file formats. */
 public enum FileFormat {
+  PUFFIN("puffin", false),
   ORC("orc", true),
   PARQUET("parquet", true),
   AVRO("avro", true),

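The second constructor argument is the splittable flag, so Puffin files are registered as non-splittable. A short usage sketch of the enum's existing helpers with the new constant (file names chosen for illustration):

```java
import org.apache.iceberg.FileFormat;

class PuffinNaming {
  public static void main(String[] args) {
    // Derive a Puffin file name and map it back to the enum constant.
    String fileName = FileFormat.PUFFIN.addExtension("delete-vector-0001");
    System.out.println(fileName);                          // delete-vector-0001.puffin
    System.out.println(FileFormat.fromFileName(fileName)); // PUFFIN
    System.out.println(FileFormat.PUFFIN.isSplittable());  // false
  }
}
```
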
3 changes: 2 additions & 1 deletion api/src/main/java/org/apache/iceberg/FileScanTask.java

@@ -19,6 +19,7 @@
 package org.apache.iceberg;
 
 import java.util.List;
+import org.apache.iceberg.util.ScanTaskUtil;
 
 /** A scan task over a range of bytes in a single data file. */
 public interface FileScanTask extends ContentScanTask<DataFile>, SplittableScanTask<FileScanTask> {
@@ -36,7 +37,7 @@ default Schema schema() {
 
   @Override
   default long sizeBytes() {
-    return length() + deletes().stream().mapToLong(ContentFile::fileSizeInBytes).sum();
+    return length() + ScanTaskUtil.contentSizeInBytes(deletes());
   }
 
   @Override

2 changes: 1 addition & 1 deletion api/src/main/java/org/apache/iceberg/ManifestFile.java

@@ -49,7 +49,7 @@ public interface ManifestFile {
           Types.LongType.get(),
           "Lowest sequence number in the manifest");
   Types.NestedField SNAPSHOT_ID =
-      optional(
+      required(
           503, "added_snapshot_id", Types.LongType.get(), "Snapshot ID that added the manifest");
   Types.NestedField ADDED_FILES_COUNT =
       optional(504, "added_files_count", Types.IntegerType.get(), "Added entry count");

10 changes: 7 additions & 3 deletions api/src/main/java/org/apache/iceberg/Schema.java

@@ -28,6 +28,7 @@
 import java.util.Map;
 import java.util.Set;
 import java.util.stream.Collectors;
+import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
 import org.apache.iceberg.relocated.com.google.common.base.Joiner;
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
 import org.apache.iceberg.relocated.com.google.common.collect.BiMap;
@@ -54,9 +55,12 @@ public class Schema implements Serializable {
   private static final Joiner NEWLINE = Joiner.on('\n');
   private static final String ALL_COLUMNS = "*";
   private static final int DEFAULT_SCHEMA_ID = 0;
-  private static final int DEFAULT_VALUES_MIN_FORMAT_VERSION = 3;
-  private static final Map<Type.TypeID, Integer> MIN_FORMAT_VERSIONS =
-      ImmutableMap.of(Type.TypeID.TIMESTAMP_NANO, 3);
+
+  @VisibleForTesting static final int DEFAULT_VALUES_MIN_FORMAT_VERSION = 3;
+
+  @VisibleForTesting
+  static final Map<Type.TypeID, Integer> MIN_FORMAT_VERSIONS =
+      ImmutableMap.of(Type.TypeID.TIMESTAMP_NANO, 3, Type.TypeID.VARIANT, 3);
 
   private final StructType struct;
   private final int schemaId;

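The widened `MIN_FORMAT_VERSIONS` map now registers the variant type as requiring table format version 3, alongside nanosecond timestamps. A hedged illustration of the check this map backs, assuming the existing `Schema.checkCompatibility(Schema, int)` entry point behaves as its name suggests:

```java
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

import static org.apache.iceberg.types.Types.NestedField.required;

class FormatVersionGate {
  public static void main(String[] args) {
    Schema schema =
        new Schema(
            required(1, "id", Types.LongType.get()),
            required(2, "ts", Types.TimestampNanoType.withZone()));

    // Fine for a v3 table.
    Schema.checkCompatibility(schema, 3);

    // Expected to throw for v2, since timestamptz_ns (and, with this change,
    // variant) are only supported from format version 3 onward.
    Schema.checkCompatibility(schema, 2);
  }
}
```
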
4 changes: 3 additions & 1 deletion api/src/main/java/org/apache/iceberg/UpdateSchema.java

@@ -369,7 +369,9 @@ default UpdateSchema updateColumn(String name, Type.PrimitiveType newType, String newDoc) {
    * to create a union schema.
    *
    * <p>For fields with the same canonical names in both schemas, it is required that the widened
-   * type is supported using {@link UpdateSchema#updateColumn(String, Type.PrimitiveType)}
+   * type is supported using {@link UpdateSchema#updateColumn(String, Type.PrimitiveType)}.
+   * Differences in type are ignored if the new type is narrower than the existing type (e.g. long
+   * to int, double to float).
    *
    * <p>Only supports turning a previously required field into an optional one if it is marked
    * optional in the provided new schema using {@link UpdateSchema#makeColumnOptional(String)}

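The clarified javadoc belongs to `UpdateSchema#unionByNameWith(Schema)`. A short usage sketch (the `evolve` helper, the table, and the incoming schema are assumptions for illustration):

```java
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.types.Types;

import static org.apache.iceberg.types.Types.NestedField.optional;
import static org.apache.iceberg.types.Types.NestedField.required;

class UnionSchemas {
  // Merge an incoming schema into the table schema by field name.
  static void evolve(Table table) {
    // Suppose "id" is int in the table: long here is a widening and is applied;
    // a narrower incoming type would simply be ignored per the clarified javadoc.
    Schema incoming =
        new Schema(
            required(1, "id", Types.LongType.get()),
            optional(2, "email", Types.StringType.get())); // new column, added as optional

    table.updateSchema().unionByNameWith(incoming).commit();
  }
}
```
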
5 changes: 2 additions & 3 deletions core/src/main/java/org/apache/iceberg/encryption/EncryptingFileIO.java

@@ -93,10 +93,9 @@ public InputFile newInputFile(DeleteFile file) {
 
   private InputFile newInputFile(ContentFile<?> file) {
     if (file.keyMetadata() != null) {
-      return newDecryptingInputFile(
-          file.path().toString(), file.fileSizeInBytes(), file.keyMetadata());
+      return newDecryptingInputFile(file.location(), file.fileSizeInBytes(), file.keyMetadata());
     } else {
-      return newInputFile(file.path().toString(), file.fileSizeInBytes());
+      return newInputFile(file.location(), file.fileSizeInBytes());
     }
   }
 
@@ -148,7 +147,7 @@ public void close() {
   }
 
   private SimpleEncryptedInputFile wrap(ContentFile<?> file) {
-    InputFile encryptedInputFile = io.newInputFile(file.path().toString(), file.fileSizeInBytes());
+    InputFile encryptedInputFile = io.newInputFile(file.location(), file.fileSizeInBytes());
     return new SimpleEncryptedInputFile(encryptedInputFile, toKeyMetadata(file.keyMetadata()));
   }
 

11 changes: 6 additions & 5 deletions api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java

@@ -337,9 +337,9 @@ public <T> Expression predicate(UnboundPredicate<T> pred) {
             pred.op(), pred.term(), (T) sanitize(pred.literal(), now, today));
       case IN:
       case NOT_IN:
-        Iterable<String> iter =
-            () -> pred.literals().stream().map(lit -> sanitize(lit, now, today)).iterator();
-        return new UnboundPredicate<>(pred.op(), pred.term(), (Iterable<T>) iter);
+        Iterable<T> iter =
+            () -> pred.literals().stream().map(lit -> (T) sanitize(lit, now, today)).iterator();
+        return new UnboundPredicate<>(pred.op(), pred.term(), iter);
       default:
         throw new UnsupportedOperationException(
             "Cannot sanitize unsupported predicate type: " + pred.op());
@@ -534,7 +534,8 @@ private static String sanitize(Type type, Object value, long now, int today) {
       case DECIMAL:
       case FIXED:
       case BINARY:
-        // for boolean, uuid, decimal, fixed, and binary, match the string result
+      case VARIANT:
+        // for boolean, uuid, decimal, fixed, variant, and binary, match the string result
         return sanitizeSimpleString(value.toString());
     }
     throw new UnsupportedOperationException(
@@ -562,7 +563,7 @@ private static String sanitize(Literal<?> literal, long now, int today) {
     } else if (literal instanceof Literals.DoubleLiteral) {
      return sanitizeNumber(((Literals.DoubleLiteral) literal).value(), "float");
     } else {
-      // for uuid, decimal, fixed, and binary, match the string result
+      // for uuid, decimal, fixed, variant, and binary, match the string result
       return sanitizeSimpleString(literal.value().toString());
     }
   }

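With `VARIANT` handled, sanitization masks variant literals the same way as other opaque values, and the `IN`/`NOT IN` branch now keeps the predicate's literal type parameter intact instead of forcing `Iterable<String>`. A hedged sketch of the public entry point involved (the printed output shape is illustrative, not verbatim):

```java
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.ExpressionUtil;
import org.apache.iceberg.expressions.Expressions;

class SanitizeDemo {
  public static void main(String[] args) {
    Expression in = Expressions.in("category", "books", "games");

    // Literal values are replaced with placeholders so filters can be logged
    // safely, producing something like: category IN ((hash-1), (hash-2))
    System.out.println(ExpressionUtil.toSanitizedString(in));
  }
}
```
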