[apache#4089] fix(hive catalog): the problem of slow acquisition of hive table list (apache#4469)

### What changes were proposed in this pull request?

Fix the slow acquisition of the Hive table list.
Use `listTableNamesByFilter` in place of the `getTableObjectsByName` method.
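
For context, a minimal sketch of the filter-based call the PR switches to. The helper class, method name, and database name are illustrative; `IMetaStoreClient.listTableNamesByFilter` and the `HIVE_FILTER_FIELD_PARAMS` prefix are existing Hive Metastore APIs:

```java
import java.util.List;
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.thrift.TException;

class FilterSketch {
  // Lists Iceberg table names in one metastore round trip. The filter is
  // evaluated server-side, so no per-table metadata objects are fetched.
  static List<String> listIcebergTables(IMetaStoreClient client, String db) throws TException {
    String filter =
        hive_metastoreConstants.HIVE_FILTER_FIELD_PARAMS + "table_type like \"ICEBERG\"";
    // (short) -1 means no cap on the number of table names returned.
    return client.listTableNamesByFilter(db, filter, (short) -1);
  }
}
```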


### Why are the changes needed?

I found that listing tables takes about 300s when a schema has 5,000 tables, because `getTableObjectsByName` fetches the full metadata object for every table.

Fix: apache#4089 

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Manual testing
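
A rough sketch of the kind of manual check this describes (the helper class and schema name are hypothetical; the `listTables` call mirrors the integration test below):

```java
import org.apache.gravitino.Catalog;
import org.apache.gravitino.NameIdentifier;
import org.apache.gravitino.Namespace;

class ListTablesTiming {
  // Times listTables against a schema with many tables (e.g., ~5000); the
  // filter-based path avoids fetching per-table metadata objects.
  static void timeListTables(Catalog catalog, String schemaName) {
    long start = System.currentTimeMillis();
    NameIdentifier[] tables = catalog.asTableCatalog().listTables(Namespace.of(schemaName));
    System.out.printf(
        "listed %d tables in %d ms%n", tables.length, System.currentTimeMillis() - start);
  }
}
```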

---------

Co-authored-by: ericqin <[email protected]>
Co-authored-by: mchades <[email protected]>
3 people authored Nov 4, 2024
1 parent 452d97e commit 6d05ec4
Showing 4 changed files with 106 additions and 35 deletions.
@@ -21,9 +21,7 @@
 import static org.apache.gravitino.catalog.hive.HiveCatalogPropertiesMeta.LIST_ALL_TABLES;
 import static org.apache.gravitino.catalog.hive.HiveCatalogPropertiesMeta.METASTORE_URIS;
 import static org.apache.gravitino.catalog.hive.HiveCatalogPropertiesMeta.PRINCIPAL;
-import static org.apache.gravitino.catalog.hive.HiveTable.ICEBERG_TABLE_TYPE_VALUE;
 import static org.apache.gravitino.catalog.hive.HiveTable.SUPPORT_TABLE_TYPES;
-import static org.apache.gravitino.catalog.hive.HiveTable.TABLE_TYPE_PROP;
 import static org.apache.gravitino.catalog.hive.HiveTablePropertiesMetadata.COMMENT;
 import static org.apache.gravitino.catalog.hive.HiveTablePropertiesMetadata.TABLE_TYPE;
 import static org.apache.gravitino.connector.BaseCatalog.CATALOG_BYPASS_PREFIX;
@@ -93,6 +91,7 @@
 import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
 import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
 import org.apache.hadoop.hive.metastore.api.UnknownDBException;
+import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
 import org.apache.hadoop.security.SecurityUtil;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.thrift.TException;
@@ -116,7 +115,10 @@ public class HiveCatalogOperations implements CatalogOperations, SupportsSchemas
   private ScheduledThreadPoolExecutor checkTgtExecutor;
   private String kerberosRealm;
   private ProxyPlugin proxyPlugin;
-  boolean listAllTables = true;
+  private boolean listAllTables = true;
+  // The maximum number of tables that can be returned by the listTableNamesByFilter function.
+  // The default value is -1, which means that all tables are returned.
+  private static final short MAX_TABLES = -1;

   // Map that maintains the mapping of keys in Gravitino to that in Hive, for example, users
   // will only need to set the configuration 'METASTORE_URL' in Gravitino and Gravitino will change
Expand Down Expand Up @@ -539,23 +541,32 @@ public NameIdentifier[] listTables(Namespace namespace) throws NoSuchSchemaExcep
       // then based on
       // those names we can obtain metadata for each individual table and get the type we needed.
       List<String> allTables = clientPool.run(c -> c.getAllTables(schemaIdent.name()));
-      return clientPool.run(
-          c ->
-              c.getTableObjectsByName(schemaIdent.name(), allTables).stream()
-                  .filter(
-                      tb -> {
-                        boolean isSupportTable = SUPPORT_TABLE_TYPES.contains(tb.getTableType());
-                        if (!isSupportTable) {
-                          return false;
-                        }
-                        if (!listAllTables) {
-                          Map<String, String> parameters = tb.getParameters();
-                          return isHiveTable(parameters);
-                        }
-                        return true;
-                      })
-                  .map(tb -> NameIdentifier.of(namespace, tb.getTableName()))
-                  .toArray(NameIdentifier[]::new));
+      if (!listAllTables) {
+        // The reason for using the listTableNamesByFilter function is that the
+        // getTableObjectsByName function has poor performance. Currently, we focus on the
+        // Iceberg, Paimon, and Hudi tables. In the future, if necessary, we will need to filter
+        // out other tables. In addition, the current return also includes tables of type VIRTUAL_VIEW.
+        String icebergAndPaimonFilter = getIcebergAndPaimonFilter();
+        List<String> icebergAndPaimonTables =
+            clientPool.run(
+                c ->
+                    c.listTableNamesByFilter(
+                        schemaIdent.name(), icebergAndPaimonFilter, MAX_TABLES));
+        allTables.removeAll(icebergAndPaimonTables);
+
+        // filter out the Hudi tables
+        String hudiFilter =
+            String.format(
+                "%sprovider like \"hudi\"", hive_metastoreConstants.HIVE_FILTER_FIELD_PARAMS);
+        List<String> hudiTables =
+            clientPool.run(
+                c -> c.listTableNamesByFilter(schemaIdent.name(), hudiFilter, MAX_TABLES));
+        removeHudiTables(allTables, hudiTables);
+      }
+      return allTables.stream()
+          .map(tbName -> NameIdentifier.of(namespace, tbName))
+          .toArray(NameIdentifier[]::new);
+
     } catch (UnknownDBException e) {
       throw new NoSuchSchemaException(
           "Schema (database) does not exist %s in Hive Metastore", namespace);
@@ -569,20 +580,24 @@ public NameIdentifier[] listTables(Namespace namespace) throws NoSuchSchemaExcep
     }
   }

-  boolean isHiveTable(Map<String, String> tableParameters) {
-    if (isIcebergTable(tableParameters)) return false;
-    return true;
-  }
-
-  boolean isIcebergTable(Map<String, String> tableParameters) {
-    if (tableParameters != null) {
-      boolean isIcebergTable =
-          ICEBERG_TABLE_TYPE_VALUE.equalsIgnoreCase(tableParameters.get(TABLE_TYPE_PROP));
-      if (isIcebergTable) {
-        return true;
-      }
-    }
-    return false;
-  }
+  private static String getIcebergAndPaimonFilter() {
+    String icebergFilter =
+        String.format(
+            "%stable_type like \"ICEBERG\"", hive_metastoreConstants.HIVE_FILTER_FIELD_PARAMS);
+    String paimonFilter =
+        String.format(
+            "%stable_type like \"PAIMON\"", hive_metastoreConstants.HIVE_FILTER_FIELD_PARAMS);
+    return String.format("%s or %s", icebergFilter, paimonFilter);
+  }
+
+  private void removeHudiTables(List<String> allTables, List<String> hudiTables) {
+    for (String hudiTable : hudiTables) {
+      allTables.removeIf(
+          t ->
+              t.equals(hudiTable)
+                  || t.startsWith(hudiTable + "_ro")
+                  || t.startsWith(hudiTable + "_rt"));
+    }
+  }

   /**
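An aside on the Hudi handling above: Hudi merge-on-read tables can register companion `_ro` (read-optimized) and `_rt` (real-time) views in the metastore next to the base table name, which is why `removeHudiTables` strips the suffixed variants as well. A self-contained sketch of that matching rule, with illustrative table names:

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

class HudiSuffixSketch {
  public static void main(String[] args) {
    List<String> allTables =
        new ArrayList<>(Arrays.asList("orders", "orders_ro", "orders_rt", "plain_hive"));
    String hudiTable = "orders"; // a name returned by the provider=hudi filter
    // Drop the base name and its _ro/_rt companion views, as removeHudiTables does.
    allTables.removeIf(
        t ->
            t.equals(hudiTable)
                || t.startsWith(hudiTable + "_ro")
                || t.startsWith(hudiTable + "_rt"));
    System.out.println(allTables); // prints [plain_hive]
  }
}
```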
@@ -25,6 +25,7 @@
 import static org.apache.gravitino.rel.expressions.transforms.Transforms.identity;
 import static org.apache.hadoop.hive.metastore.TableType.EXTERNAL_TABLE;

+import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.Maps;
 import java.time.Instant;
 import java.util.Arrays;
@@ -70,14 +71,14 @@ public class TestHiveTable extends MiniHiveMetastoreService {
       NameIdentifier.of(META_LAKE_NAME, HIVE_CATALOG_NAME, HIVE_SCHEMA_NAME);

   @BeforeAll
-  private static void setup() {
+  public static void setup() {
     hiveCatalog = initHiveCatalog();
     hiveCatalogOperations = (HiveCatalogOperations) hiveCatalog.ops();
     hiveSchema = initHiveSchema();
   }

   @AfterEach
-  private void resetSchema() {
+  public void resetSchema() {
     hiveCatalogOperations.dropSchema(schemaIdent, true);
     hiveSchema = initHiveSchema();
   }
@@ -343,7 +344,35 @@ public void testDropHiveTable() {
   }

   @Test
-  public void testListTableException() {
+  public void testListTable() {
+    // mock iceberg table and hudi table
+    NameIdentifier icebergTableIdent =
+        NameIdentifier.of(META_LAKE_NAME, hiveCatalog.name(), hiveSchema.name(), "iceberg_table");
+    NameIdentifier hudiTableIdent =
+        NameIdentifier.of(META_LAKE_NAME, hiveCatalog.name(), hiveSchema.name(), "hudi_table");
+
+    hiveCatalogOperations.createTable(
+        icebergTableIdent,
+        new Column[] {
+          HiveColumn.builder().withName("col_1").withType(Types.ByteType.get()).build()
+        },
+        HIVE_COMMENT,
+        ImmutableMap.of("table_type", "ICEBERG"));
+    hiveCatalogOperations.createTable(
+        hudiTableIdent,
+        new Column[] {
+          HiveColumn.builder().withName("col_1").withType(Types.ByteType.get()).build()
+        },
+        HIVE_COMMENT,
+        ImmutableMap.of("provider", "hudi"));
+
+    // test list table
+    NameIdentifier[] tableIdents =
+        hiveCatalogOperations.listTables(
+            Namespace.of("metalake", hiveCatalog.name(), hiveSchema.name()));
+    Assertions.assertEquals(0, tableIdents.length);
+
+    // test exception
     Namespace tableNs = Namespace.of("metalake", hiveCatalog.name(), "not_exist_db");
     TableCatalog tableCatalog = hiveCatalogOperations;
     Throwable exception =
@@ -52,6 +52,7 @@
 import org.apache.gravitino.CatalogChange;
 import org.apache.gravitino.MetalakeChange;
 import org.apache.gravitino.NameIdentifier;
+import org.apache.gravitino.Namespace;
 import org.apache.gravitino.Schema;
 import org.apache.gravitino.SchemaChange;
 import org.apache.gravitino.SupportsSchemas;
@@ -613,6 +614,25 @@ public void testHiveTableProperties() throws TException, InterruptedException {
     Assertions.assertTrue(exception.getMessage().contains("cannot be set"));
   }

+  @Test
+  public void testListTables() {
+    // mock iceberg, paimon, and hudi tables
+    NameIdentifier icebergTable = NameIdentifier.of(schemaName, "iceberg_table");
+    NameIdentifier paimonTable = NameIdentifier.of(schemaName, "paimon_table");
+    NameIdentifier hudiTable = NameIdentifier.of(schemaName, "hudi_table");
+    catalog
+        .asTableCatalog()
+        .createTable(icebergTable, createColumns(), null, ImmutableMap.of("table_type", "ICEBERG"));
+    catalog
+        .asTableCatalog()
+        .createTable(paimonTable, createColumns(), null, ImmutableMap.of("table_type", "PAIMON"));
+    catalog
+        .asTableCatalog()
+        .createTable(hudiTable, createColumns(), null, ImmutableMap.of("provider", "hudi"));
+    NameIdentifier[] tables = catalog.asTableCatalog().listTables(Namespace.of(schemaName));
+    Assertions.assertEquals(0, tables.length);
+  }
+
   @Test
   public void testHiveSchemaProperties() throws TException, InterruptedException {
     // test LOCATION property
9 changes: 8 additions & 1 deletion docs/apache-hive-catalog.md
@@ -41,7 +41,14 @@ Besides the [common catalog properties](./gravitino-server-config.md#gravitino-c
 | `kerberos.keytab-uri` | The uri of key tab for the catalog. Now supported protocols are `https`, `http`, `ftp`, `file`. | (none) | required if you use kerberos | 0.4.0 |
 | `kerberos.check-interval-sec` | The interval to check validness of the principal | 60 | No | 0.4.0 |
 | `kerberos.keytab-fetch-timeout-sec` | The timeout to fetch key tab | 60 | No | 0.4.0 |
-| `list-all-tables` | Lists all tables in a database, including non-Hive tables, such as Iceberg, etc | false | No | 0.5.1 |
+| `list-all-tables` | Lists all tables in a database, including non-Hive tables, such as Iceberg, Hudi, etc. | false | No | 0.5.1 |
+
+:::note
+For `list-all-tables=false`, the Hive catalog will filter out:
+- Iceberg tables by table property `table_type=ICEBERG`
+- Paimon tables by table property `table_type=PAIMON`
+- Hudi tables by table property `provider=hudi`
+:::

 When you use Gravitino with Trino, you can pass the Trino Hive connector configuration using the prefix `trino.bypass.`. For example, use `trino.bypass.hive.config.resources` to pass `hive.config.resources` to the Gravitino Hive catalog in the Trino runtime.

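As a usage sketch for the property documented above (the metastore URI is illustrative; the property keys follow the table in this doc):

```java
import com.google.common.collect.ImmutableMap;
import java.util.Map;

class HiveCatalogProps {
  // A Hive catalog created with these properties filters Iceberg, Paimon, and
  // Hudi tables out of listTables results via the metastore-side filters above.
  static final Map<String, String> PROPERTIES =
      ImmutableMap.of(
          "metastore.uris", "thrift://localhost:9083", // illustrative URI
          "list-all-tables", "false");
}
```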
