Skip to content

Commit

Permalink
Restructure the code to be extensible to different query modes, table types, file formats, etc. (trinodb#14)
Browse files Browse the repository at this point in the history

* Restructure the code to be extensible to different query modes, table types, file formats, etc.

* Fix issues in getting splits

* Fix concurrent read of partition info and remove redundant logging

* Adjust logging

* Address review comments
  • Loading branch information
yihua authored Jan 7, 2022
1 parent 033addd commit ba0bca8
Show file tree
Hide file tree
Showing 24 changed files with 1,807 additions and 555 deletions.
71 changes: 67 additions & 4 deletions plugin/trino-hudi/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,6 @@
<groupId>io.airlift</groupId>
<artifactId>log</artifactId>
</dependency>
<dependency>
<groupId>io.airlift</groupId>
<artifactId>units</artifactId>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
Expand Down Expand Up @@ -199,6 +195,73 @@
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-hive-sync</artifactId>
<version>${dep.hudi.version}</version>
<exclusions>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-auth</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hive</groupId>
<artifactId>hive-common</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hive</groupId>
<artifactId>hive-metastore</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hive</groupId>
<artifactId>hive-service</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-common</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-hadoop-mr</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-sync-common</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-avro</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
<exclusion>
<groupId>com.beust</groupId>
<artifactId>jcommander</artifactId>
</exclusion>
<exclusion>
<groupId>servletapi</groupId>
<artifactId>servletapi</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.weakref</groupId>
<artifactId>jmxutils</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

import io.airlift.configuration.Config;
import io.airlift.configuration.ConfigDescription;
import io.airlift.units.DataSize;
import org.apache.hudi.common.model.HoodieFileFormat;

import javax.validation.constraints.NotNull;
Expand All @@ -28,7 +27,9 @@ public class HudiConfig
private HoodieFileFormat fileFormat = PARQUET;
private boolean metadataEnabled;
private boolean shouldSkipMetaStoreForPartition = true;
private DataSize maxSplitSize = DataSize.ofBytes(128 * 1024 * 1024);
private boolean shouldUseParquetColumnNames = true;
private int partitionScannerParallelism = 4;
private int splitGeneratorParallelism = 4;

@NotNull
public HoodieFileFormat getFileFormat()
Expand Down Expand Up @@ -57,30 +58,60 @@ public boolean isMetadataEnabled()
return this.metadataEnabled;
}

@Config("hudi.max-split-size")
public HudiConfig setMaxSplitSize(DataSize size)
@Config("hudi.skip-metastore-for-partition")
@ConfigDescription("Whether to skip metastore for partition")
public HudiConfig setSkipMetaStoreForPartition(boolean shouldSkipMetaStoreForPartition)
{
this.maxSplitSize = size;
this.shouldSkipMetaStoreForPartition = shouldSkipMetaStoreForPartition;
return this;
}

@NotNull
public DataSize getMaxSplitSize()
public boolean getSkipMetaStoreForPartition()
{
return this.maxSplitSize;
return this.shouldSkipMetaStoreForPartition;
}

@Config("hudi.skip-metastore-for-partition")
@ConfigDescription("Whether to skip metastore for partition")
public HudiConfig setSkipMetaStoreForPartition(boolean shouldSkipMetaStoreForPartition)
@Config("hudi.use-parquet-column-names")
@ConfigDescription("Whether to use column names from parquet files. "
+ "Only applicable to parquet file format.")
public HudiConfig setUseParquetColumnNames(boolean shouldUseParquetColumnNames)
{
this.shouldSkipMetaStoreForPartition = shouldSkipMetaStoreForPartition;
this.shouldUseParquetColumnNames = shouldUseParquetColumnNames;
return this;
}

@NotNull
public boolean getSkipMetaStoreForPartition()
public boolean getUseParquetColumnNames()
{
return this.shouldSkipMetaStoreForPartition;
return this.shouldUseParquetColumnNames;
}

/**
 * Sets the number of threads to use for partition scanners
 * (config property {@code hudi.partition-scanner-parallelism}).
 *
 * @param partitionScannerParallelism partition-scanner thread count
 * @return this config instance, for fluent chaining
 */
@Config("hudi.partition-scanner-parallelism")
@ConfigDescription("Number of threads to use for partition scanners")
public HudiConfig setPartitionScannerParallelism(int partitionScannerParallelism)
{
this.partitionScannerParallelism = partitionScannerParallelism;
return this;
}

/**
 * Returns the configured number of partition-scanner threads.
 *
 * @return partition-scanner thread count
 */
// The previous @NotNull annotation was removed: bean-validation @NotNull can
// never fail for a primitive int, so it was a no-op.
public int getPartitionScannerParallelism()
{
return this.partitionScannerParallelism;
}

/**
 * Sets the number of threads to use for split generators
 * (config property {@code hudi.split-generator-parallelism}).
 *
 * @param splitGeneratorParallelism split-generator thread count
 * @return this config instance, for fluent chaining
 */
@Config("hudi.split-generator-parallelism")
@ConfigDescription("Number of threads to use for split generators")
public HudiConfig setSplitGeneratorParallelism(int splitGeneratorParallelism)
{
this.splitGeneratorParallelism = splitGeneratorParallelism;
return this;
}

/**
 * Returns the configured number of split-generator threads.
 *
 * @return split-generator thread count
 */
// The previous @NotNull annotation was removed: bean-validation @NotNull can
// never fail for a primitive int, so it was a no-op.
public int getSplitGeneratorParallelism()
{
return this.splitGeneratorParallelism;
}
}
Loading

0 comments on commit ba0bca8

Please sign in to comment.