Skip to content

Commit

Permalink
Restructure the code to be extensible to different query modes, table types, file formats, etc. (trinodb#14)
Browse files Browse the repository at this point in the history

* Restructure the code to be extensible to different query modes, table types, file formats, etc.

* Fix issues in getting splits

* Fix concurrent read of partition info and remove redundant logging

* Adjust logging

* Address review comments
  • Loading branch information
yihua authored Jan 7, 2022
1 parent 033addd commit ba0bca8
Show file tree
Hide file tree
Showing 24 changed files with 1,807 additions and 555 deletions.
71 changes: 67 additions & 4 deletions plugin/trino-hudi/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,6 @@
<groupId>io.airlift</groupId>
<artifactId>log</artifactId>
</dependency>
<dependency>
<groupId>io.airlift</groupId>
<artifactId>units</artifactId>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
Expand Down Expand Up @@ -199,6 +195,73 @@
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-hive-sync</artifactId>
<version>${dep.hudi.version}</version>
<exclusions>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-auth</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hive</groupId>
<artifactId>hive-common</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hive</groupId>
<artifactId>hive-metastore</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hive</groupId>
<artifactId>hive-service</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-common</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-hadoop-mr</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-sync-common</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-avro</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
<exclusion>
<groupId>com.beust</groupId>
<artifactId>jcommander</artifactId>
</exclusion>
<exclusion>
<groupId>servletapi</groupId>
<artifactId>servletapi</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.weakref</groupId>
<artifactId>jmxutils</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

import io.airlift.configuration.Config;
import io.airlift.configuration.ConfigDescription;
import io.airlift.units.DataSize;
import org.apache.hudi.common.model.HoodieFileFormat;

import javax.validation.constraints.NotNull;
Expand All @@ -28,7 +27,9 @@ public class HudiConfig
private HoodieFileFormat fileFormat = PARQUET;
private boolean metadataEnabled;
private boolean shouldSkipMetaStoreForPartition = true;
private DataSize maxSplitSize = DataSize.ofBytes(128 * 1024 * 1024);
private boolean shouldUseParquetColumnNames = true;
private int partitionScannerParallelism = 4;
private int splitGeneratorParallelism = 4;

@NotNull
public HoodieFileFormat getFileFormat()
Expand Down Expand Up @@ -57,30 +58,60 @@ public boolean isMetadataEnabled()
return this.metadataEnabled;
}

@Config("hudi.max-split-size")
public HudiConfig setMaxSplitSize(DataSize size)
@Config("hudi.skip-metastore-for-partition")
@ConfigDescription("Whether to skip metastore for partition")
public HudiConfig setSkipMetaStoreForPartition(boolean shouldSkipMetaStoreForPartition)
{
this.maxSplitSize = size;
this.shouldSkipMetaStoreForPartition = shouldSkipMetaStoreForPartition;
return this;
}

@NotNull
public DataSize getMaxSplitSize()
public boolean getSkipMetaStoreForPartition()
{
return this.maxSplitSize;
return this.shouldSkipMetaStoreForPartition;
}

@Config("hudi.skip-metastore-for-partition")
@ConfigDescription("Whether to skip metastore for partition")
public HudiConfig setSkipMetaStoreForPartition(boolean shouldSkipMetaStoreForPartition)
@Config("hudi.use-parquet-column-names")
@ConfigDescription("Whether to use column names from parquet files. "
+ "Only applicable to parquet file format.")
public HudiConfig setUseParquetColumnNames(boolean shouldUseParquetColumnNames)
{
this.shouldSkipMetaStoreForPartition = shouldSkipMetaStoreForPartition;
this.shouldUseParquetColumnNames = shouldUseParquetColumnNames;
return this;
}

@NotNull
public boolean getSkipMetaStoreForPartition()
public boolean getUseParquetColumnNames()
{
return this.shouldSkipMetaStoreForPartition;
return this.shouldUseParquetColumnNames;
}

/**
 * Sets the number of threads to use for partition scanners
 * (config property {@code hudi.partition-scanner-parallelism}).
 *
 * @param partitionScannerParallelism partition-scanner thread count
 * @return this config instance, for fluent chaining
 */
@Config("hudi.partition-scanner-parallelism")
@ConfigDescription("Number of threads to use for partition scanners")
public HudiConfig setPartitionScannerParallelism(int partitionScannerParallelism)
{
this.partitionScannerParallelism = partitionScannerParallelism;
return this;
}

/**
 * Returns the configured number of partition-scanner threads.
 *
 * @return partition-scanner thread count
 */
// The previous @NotNull annotation was removed: bean-validation @NotNull can
// never fail for a primitive int, so it was a no-op.
public int getPartitionScannerParallelism()
{
return this.partitionScannerParallelism;
}

/**
 * Sets the number of threads to use for split generators
 * (config property {@code hudi.split-generator-parallelism}).
 *
 * @param splitGeneratorParallelism split-generator thread count
 * @return this config instance, for fluent chaining
 */
@Config("hudi.split-generator-parallelism")
@ConfigDescription("Number of threads to use for split generators")
public HudiConfig setSplitGeneratorParallelism(int splitGeneratorParallelism)
{
this.splitGeneratorParallelism = splitGeneratorParallelism;
return this;
}

/**
 * Returns the configured number of split-generator threads.
 *
 * @return split-generator thread count
 */
// The previous @NotNull annotation was removed: bean-validation @NotNull can
// never fail for a primitive int, so it was a no-op.
public int getSplitGeneratorParallelism()
{
return this.splitGeneratorParallelism;
}
}
Loading

0 comments on commit ba0bca8

Please sign in to comment.