From b5cc4cc9f7476c32798c00fc6736b2ed23f60457 Mon Sep 17 00:00:00 2001 From: cai can <94670132+caican00@users.noreply.github.com> Date: Mon, 21 Oct 2024 14:53:40 +0800 Subject: [PATCH] [#3919] feat(catalog-lakehouse-paimon): Support hive backend for Paimon Catalog (#5092) ### What changes were proposed in this pull request? Support hive backend for Paimon Catalog ### Why are the changes needed? Fix: https://github.com/apache/gravitino/issues/3919 ### Does this PR introduce _any_ user-facing change? will add doc in a later pr. ### How was this patch tested? new UT and IT. --------- Co-authored-by: caican --- .../catalog-lakehouse-paimon/build.gradle.kts | 34 +++++++- .../paimon/PaimonCatalogBackend.java | 3 +- .../integration/test/CatalogPaimonBaseIT.java | 13 ++- .../integration/test/CatalogPaimonHiveIT.java | 83 +++++++++++++++++++ .../paimon/utils/TestCatalogUtils.java | 23 ++++- docs/lakehouse-paimon-catalog.md | 8 +- 6 files changed, 154 insertions(+), 10 deletions(-) create mode 100644 catalogs/catalog-lakehouse-paimon/src/test/java/org/apache/gravitino/catalog/lakehouse/paimon/integration/test/CatalogPaimonHiveIT.java diff --git a/catalogs/catalog-lakehouse-paimon/build.gradle.kts b/catalogs/catalog-lakehouse-paimon/build.gradle.kts index 16a3382cfc5..6839cc163bc 100644 --- a/catalogs/catalog-lakehouse-paimon/build.gradle.kts +++ b/catalogs/catalog-lakehouse-paimon/build.gradle.kts @@ -46,10 +46,9 @@ dependencies { exclude("com.sun.jersey") exclude("javax.servlet") exclude("org.apache.curator") - exclude("org.apache.hive") exclude("org.apache.hbase") exclude("org.apache.zookeeper") - exclude("org.eclipse.jetty.aggregate:jetty-all") + exclude("org.eclipse.jetty.aggregate") exclude("org.mortbay.jetty") exclude("org.mortbay.jetty:jetty") exclude("org.mortbay.jetty:jetty-util") @@ -67,9 +66,40 @@ dependencies { exclude("org.apache.parquet:parquet-encoding") exclude("org.apache.parquet:parquet-common") exclude("org.apache.parquet:parquet-hadoop") + exclude("org.apache.parquet:parquet-hadoop-bundle") exclude("org.apache.paimon:paimon-codegen-loader") exclude("org.apache.paimon:paimon-shade-caffeine-2") exclude("org.apache.paimon:paimon-shade-guava-30") + exclude("org.apache.hive:hive-service-rpc") + exclude("org.apache.logging.log4j") + exclude("com.google.guava") + exclude("commons-lang") + exclude("org.slf4j") + exclude("org.apache.orc") + exclude("org.apache.httpcomponents") + exclude("jline") + exclude("org.eclipse.jetty.orbit") + exclude("org.apache.ant") + exclude("com.tdunning") + exclude("io.dropwizard.metrics") + exclude("com.github.joshelser") + exclude("commons-codec") + exclude("commons-cli") + exclude("tomcat") + exclude("org.apache.avro") + exclude("net.sf.opencsv") + exclude("javolution") + exclude("com.jolbox") + exclude("com.zaxxer") + exclude("org.apache.derby") + exclude("org.datanucleus") + exclude("commons-pool") + exclude("commons-dbcp") + exclude("javax.jdo") + exclude("org.antlr") + exclude("co.cask.tephra") + exclude("com.google.code.findbugs") + exclude("com.github.spotbugs") } implementation(libs.bundles.log4j) implementation(libs.commons.lang3) diff --git a/catalogs/catalog-lakehouse-paimon/src/main/java/org/apache/gravitino/catalog/lakehouse/paimon/PaimonCatalogBackend.java b/catalogs/catalog-lakehouse-paimon/src/main/java/org/apache/gravitino/catalog/lakehouse/paimon/PaimonCatalogBackend.java index 355a79f5850..7371c5be36f 100644 --- a/catalogs/catalog-lakehouse-paimon/src/main/java/org/apache/gravitino/catalog/lakehouse/paimon/PaimonCatalogBackend.java +++ b/catalogs/catalog-lakehouse-paimon/src/main/java/org/apache/gravitino/catalog/lakehouse/paimon/PaimonCatalogBackend.java @@ -21,5 +21,6 @@ /** The type of Apache Paimon catalog backend. */ public enum PaimonCatalogBackend { FILESYSTEM, - JDBC + JDBC, + HIVE } diff --git a/catalogs/catalog-lakehouse-paimon/src/test/java/org/apache/gravitino/catalog/lakehouse/paimon/integration/test/CatalogPaimonBaseIT.java b/catalogs/catalog-lakehouse-paimon/src/test/java/org/apache/gravitino/catalog/lakehouse/paimon/integration/test/CatalogPaimonBaseIT.java index ed90745a785..19bbde3315b 100644 --- a/catalogs/catalog-lakehouse-paimon/src/test/java/org/apache/gravitino/catalog/lakehouse/paimon/integration/test/CatalogPaimonBaseIT.java +++ b/catalogs/catalog-lakehouse-paimon/src/test/java/org/apache/gravitino/catalog/lakehouse/paimon/integration/test/CatalogPaimonBaseIT.java @@ -99,6 +99,7 @@ public abstract class CatalogPaimonBaseIT extends BaseIT { protected String jdbcPassword; protected Catalog catalog; protected org.apache.paimon.catalog.Catalog paimonCatalog; + protected SparkSession spark; protected String metalakeName = GravitinoITUtils.genRandomName("paimon_it_metalake"); protected String catalogName = GravitinoITUtils.genRandomName("paimon_it_catalog"); protected String schemaName = GravitinoITUtils.genRandomName("paimon_it_schema"); @@ -115,8 +116,8 @@ public abstract class CatalogPaimonBaseIT extends BaseIT { private static final String alertTableName = "alert_table_name"; private static String INSERT_BATCH_WITHOUT_PARTITION_TEMPLATE = "INSERT INTO paimon.%s VALUES %s"; private static final String SELECT_ALL_TEMPLATE = "SELECT * FROM paimon.%s"; + private static final String DEFAULT_DB = "default"; private GravitinoMetalake metalake; - protected SparkSession spark; private Map catalogProperties; @BeforeAll @@ -727,7 +728,7 @@ public void testAlterPaimonTable() { // update column position Column col1 = Column.of("name", Types.StringType.get(), "comment"); Column col2 = Column.of("address", Types.StringType.get(), "comment"); - Column col3 = Column.of("date_of_birth", Types.DateType.get(), "comment"); + Column col3 = Column.of("date_of_birth", Types.StringType.get(), "comment"); Column[] newColumns = new Column[] {col1, col2, col3}; NameIdentifier tableIdentifier = @@ -874,7 +875,13 @@ void testOperationDataOfPaimonTable() { private void clearTableAndSchema() { SupportsSchemas supportsSchema = catalog.asSchemas(); Arrays.stream(supportsSchema.listSchemas()) - .forEach(schema -> supportsSchema.dropSchema(schema, true)); + .forEach( + schema -> { + // can not drop default database for hive backend. + if (!DEFAULT_DB.equalsIgnoreCase(schema)) { + supportsSchema.dropSchema(schema, true); + } + }); } private void createMetalake() { diff --git a/catalogs/catalog-lakehouse-paimon/src/test/java/org/apache/gravitino/catalog/lakehouse/paimon/integration/test/CatalogPaimonHiveIT.java b/catalogs/catalog-lakehouse-paimon/src/test/java/org/apache/gravitino/catalog/lakehouse/paimon/integration/test/CatalogPaimonHiveIT.java new file mode 100644 index 00000000000..fcb220a8806 --- /dev/null +++ b/catalogs/catalog-lakehouse-paimon/src/test/java/org/apache/gravitino/catalog/lakehouse/paimon/integration/test/CatalogPaimonHiveIT.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.gravitino.catalog.lakehouse.paimon.integration.test; + +import com.google.common.collect.Maps; +import java.util.Map; +import org.apache.gravitino.NameIdentifier; +import org.apache.gravitino.Schema; +import org.apache.gravitino.SupportsSchemas; +import org.apache.gravitino.catalog.lakehouse.paimon.PaimonCatalogPropertiesMetadata; +import org.apache.gravitino.integration.test.container.HiveContainer; +import org.apache.gravitino.integration.test.util.GravitinoITUtils; +import org.apache.paimon.catalog.Catalog; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInstance; + +@Tag("gravitino-docker-test") +@TestInstance(TestInstance.Lifecycle.PER_CLASS) +public class CatalogPaimonHiveIT extends CatalogPaimonBaseIT { + + @Override + protected Map initPaimonCatalogProperties() { + Map catalogProperties = Maps.newHashMap(); + catalogProperties.put("key1", "val1"); + catalogProperties.put("key2", "val2"); + + TYPE = "hive"; + WAREHOUSE = + String.format( + "hdfs://%s:%d/user/hive/warehouse-catalog-paimon/", + containerSuite.getHiveContainer().getContainerIpAddress(), + HiveContainer.HDFS_DEFAULTFS_PORT); + URI = + String.format( + "thrift://%s:%d", + containerSuite.getHiveContainer().getContainerIpAddress(), + HiveContainer.HIVE_METASTORE_PORT); + + catalogProperties.put(PaimonCatalogPropertiesMetadata.GRAVITINO_CATALOG_BACKEND, TYPE); + catalogProperties.put(PaimonCatalogPropertiesMetadata.WAREHOUSE, WAREHOUSE); + catalogProperties.put(PaimonCatalogPropertiesMetadata.URI, URI); + + return catalogProperties; + } + + @Test + void testPaimonSchemaProperties() throws Catalog.DatabaseNotExistException { + SupportsSchemas schemas = catalog.asSchemas(); + + // create schema check. + String testSchemaName = GravitinoITUtils.genRandomName("test_schema_1"); + NameIdentifier schemaIdent = NameIdentifier.of(metalakeName, catalogName, testSchemaName); + Map schemaProperties = Maps.newHashMap(); + schemaProperties.put("key", "hive"); + Schema createdSchema = + schemas.createSchema(schemaIdent.name(), schema_comment, schemaProperties); + Assertions.assertEquals(createdSchema.properties().get("key"), "hive"); + + // load schema check. + Schema schema = schemas.loadSchema(schemaIdent.name()); + Assertions.assertEquals(schema.properties().get("key"), "hive"); + Map loadedProps = paimonCatalog.loadDatabaseProperties(schemaIdent.name()); + Assertions.assertEquals(loadedProps.get("key"), "hive"); + } +} diff --git a/catalogs/catalog-lakehouse-paimon/src/test/java/org/apache/gravitino/catalog/lakehouse/paimon/utils/TestCatalogUtils.java b/catalogs/catalog-lakehouse-paimon/src/test/java/org/apache/gravitino/catalog/lakehouse/paimon/utils/TestCatalogUtils.java index e8fe66551ba..d1b50d52073 100644 --- a/catalogs/catalog-lakehouse-paimon/src/test/java/org/apache/gravitino/catalog/lakehouse/paimon/utils/TestCatalogUtils.java +++ b/catalogs/catalog-lakehouse-paimon/src/test/java/org/apache/gravitino/catalog/lakehouse/paimon/utils/TestCatalogUtils.java @@ -29,9 +29,12 @@ import java.util.function.Consumer; import org.apache.gravitino.catalog.lakehouse.paimon.PaimonCatalogBackend; import org.apache.gravitino.catalog.lakehouse.paimon.PaimonConfig; +import org.apache.gravitino.integration.test.container.ContainerSuite; +import org.apache.gravitino.integration.test.container.HiveContainer; import org.apache.paimon.catalog.Catalog; import org.apache.paimon.catalog.FileSystemCatalog; import org.apache.paimon.factories.FactoryException; +import org.apache.paimon.hive.HiveCatalog; import org.apache.paimon.jdbc.JdbcCatalog; import org.junit.jupiter.api.Test; @@ -44,6 +47,8 @@ void testLoadCatalogBackend() throws Exception { assertCatalog(PaimonCatalogBackend.FILESYSTEM.name(), FileSystemCatalog.class); // Test load JdbcCatalog for jdbc metastore. assertCatalog(PaimonCatalogBackend.JDBC.name(), JdbcCatalog.class); + // Test load HiveCatalog for hive metastore. + assertCatalog(PaimonCatalogBackend.HIVE.name(), HiveCatalog.class); // Test load catalog exception for other metastore. assertThrowsExactly(FactoryException.class, () -> assertCatalog("other", catalog -> {})); } @@ -66,7 +71,7 @@ private void assertCatalog(String metastore, Consumer consumer) throws System.getProperty("java.io.tmpdir"), "paimon_catalog_warehouse"), PaimonConfig.CATALOG_URI.getKey(), - "jdbc:h2:mem:testdb", + generateUri(metastore), PaimonConfig.CATALOG_JDBC_USER.getKey(), "user", PaimonConfig.CATALOG_JDBC_PASSWORD.getKey(), @@ -75,4 +80,20 @@ private void assertCatalog(String metastore, Consumer consumer) throws consumer.accept(catalog); } } + + private static String generateUri(String metastore) { + String uri = "uri"; + if (PaimonCatalogBackend.JDBC.name().equalsIgnoreCase(metastore)) { + uri = "jdbc:h2:mem:testdb"; + } else if (PaimonCatalogBackend.HIVE.name().equalsIgnoreCase(metastore)) { + ContainerSuite containerSuite = ContainerSuite.getInstance(); + containerSuite.startHiveContainer(); + uri = + String.format( + "thrift://%s:%d", + containerSuite.getHiveContainer().getContainerIpAddress(), + HiveContainer.HIVE_METASTORE_PORT); + } + return uri; + } } diff --git a/docs/lakehouse-paimon-catalog.md b/docs/lakehouse-paimon-catalog.md index 4c336f3d323..14595f06cbf 100644 --- a/docs/lakehouse-paimon-catalog.md +++ b/docs/lakehouse-paimon-catalog.md @@ -22,17 +22,16 @@ Builds with Apache Paimon `0.8.0`. ### Catalog capabilities -- Works as a catalog proxy, supporting `FilesystemCatalog` and `JdbcCatalog`. +- Works as a catalog proxy, supporting `FilesystemCatalog`, `JdbcCatalog` and `HiveCatalog`. - Supports DDL operations for Paimon schemas and tables. -- Doesn't support `HiveCatalog` catalog backend now. - Doesn't support alterSchema. ### Catalog properties | Property name | Description | Default value | Required | Since Version | |----------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|-----------------------------------------------------------------|-------------------| -| `catalog-backend` | Catalog backend of Gravitino Paimon catalog. Supports `filesystem` and `jdbc` now. | (none) | Yes | 0.6.0-incubating | +| `catalog-backend` | Catalog backend of Gravitino Paimon catalog. Supports `filesystem`, `jdbc` and `hive`. | (none) | Yes | 0.6.0-incubating | | `uri` | The URI configuration of the Paimon catalog. `thrift://127.0.0.1:9083` or `jdbc:postgresql://127.0.0.1:5432/db_name` or `jdbc:mysql://127.0.0.1:3306/metastore_db`. It is optional for `FilesystemCatalog`. | (none) | required if the value of `catalog-backend` is not `filesystem`. | 0.6.0-incubating | | `warehouse` | Warehouse directory of catalog. `file:///user/hive/warehouse-paimon/` for local fs, `hdfs://namespace/hdfs/path` for HDFS , `s3://{bucket-name}/path/` for S3 or `oss://{bucket-name}/path` for Aliyun OSS | (none) | Yes | 0.6.0-incubating | | `authentication.type` | The type of authentication for Paimon catalog backend, currently Gravitino only supports `Kerberos` and `simple`. | `simple` | No | 0.6.0-incubating | @@ -51,6 +50,9 @@ Builds with Apache Paimon `0.8.0`. If you want to use the `oss` or `s3` warehouse, you need to place related jars in the `catalogs/lakehouse-paimon/lib` directory, more information can be found in the [Paimon S3](https://paimon.apache.org/docs/master/filesystems/s3/). ::: +:::note +The hive backend does not support the kerberos authentication now. +::: Any properties not defined by Gravitino with `gravitino.bypass.` prefix will pass to Paimon catalog properties and HDFS configuration. For example, if specify `gravitino.bypass.table.type`, `table.type` will pass to Paimon catalog properties.