diff --git a/docs/src/main/sphinx/connector/hive-s3.md b/docs/src/main/sphinx/connector/hive-s3.md index 70c11a6b82198..8cfdfb450de7a 100644 --- a/docs/src/main/sphinx/connector/hive-s3.md +++ b/docs/src/main/sphinx/connector/hive-s3.md @@ -312,85 +312,3 @@ classpath and must be able to communicate with your custom key management system the `org.apache.hadoop.conf.Configurable` interface from the Hadoop Java API, then the Hadoop configuration is passed in after the object instance is created, and before it is asked to provision or retrieve any encryption keys. - -(s3selectpushdown)= - -## S3 Select pushdown - -S3 Select pushdown enables pushing down projection (SELECT) and predicate (WHERE) -processing to [S3 Select](https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html). -With S3 Select Pushdown, Trino only retrieves the required data from S3 instead -of entire S3 objects, reducing both latency and network usage. - -### Is S3 Select a good fit for my workload? - -Performance of S3 Select pushdown depends on the amount of data filtered by the -query. Filtering a large number of rows should result in better performance. If -the query doesn't filter any data, then pushdown may not add any additional value -and the user is charged for S3 Select requests. Thus, we recommend that you -benchmark your workloads with and without S3 Select to see if using it may be -suitable for your workload. By default, S3 Select Pushdown is disabled and you -should enable it in production after proper benchmarking and cost analysis. For -more information on S3 Select request cost, please see -[Amazon S3 Cloud Storage Pricing](https://aws.amazon.com/s3/pricing/). - -Use the following guidelines to determine if S3 Select is a good fit for your -workload: - -- Your query filters out more than half of the original data set. -- Your query filter predicates use columns that have a data type supported by - Trino and S3 Select. - The `TIMESTAMP`, `DECIMAL`, `REAL`, and `DOUBLE` data types are not - supported by S3 Select Pushdown. For more information about supported data - types for S3 Select, see the - [Data Types documentation](https://docs.aws.amazon.com/AmazonS3/latest/dev/s3-glacier-select-sql-reference-data-types.html). -- Your network connection between Amazon S3 and the Amazon EMR cluster has good - transfer speed and available bandwidth. Amazon S3 Select does not compress - HTTP responses, so the response size may increase for compressed input files. - -### Considerations and limitations - -- Only objects stored in JSON format are supported. Objects can be uncompressed, - or optionally compressed with gzip or bzip2. -- The "AllowQuotedRecordDelimiters" property is not supported. If this property - is specified, the query fails. -- Amazon S3 server-side encryption with customer-provided encryption keys - (SSE-C) and client-side encryption are not supported. -- S3 Select Pushdown is not a substitute for using columnar or compressed file - formats such as ORC and Parquet. - -### Enabling S3 Select pushdown - -You can enable S3 Select Pushdown using the `s3_select_pushdown_enabled` -Hive session property, or using the `hive.s3select-pushdown.enabled` -configuration property. The session property overrides the config -property, allowing you enable or disable on a per-query basis. Non-filtering -queries (`SELECT * FROM table`) are not pushed down to S3 Select, -as they retrieve the entire object content. - -For uncompressed files, S3 Select scans ranges of bytes in parallel. 
The scan range -requests run across the byte ranges of the internal Hive splits for the query fragments -pushed down to S3 Select. Changes in the Hive connector {ref}`performance tuning -configuration properties ` are likely to impact -S3 Select pushdown performance. - -S3 Select can be enabled for TEXTFILE data using the -`hive.s3select-pushdown.experimental-textfile-pushdown-enabled` configuration property, -however this has been shown to produce incorrect results. For more information see -[the GitHub Issue.](https://github.com/trinodb/trino/issues/17775) - -### Understanding and tuning the maximum connections - -Trino can use its native S3 file system or EMRFS. When using the native FS, the -maximum connections is configured via the `hive.s3.max-connections` -configuration property. When using EMRFS, the maximum connections is configured -via the `fs.s3.maxConnections` Hadoop configuration property. - -S3 Select Pushdown bypasses the file systems, when accessing Amazon S3 for -predicate operations. In this case, the value of -`hive.s3select-pushdown.max-connections` determines the maximum number of -client connections allowed for those operations from worker nodes. - -If your workload experiences the error *Timeout waiting for connection from -pool*, increase the value of both `hive.s3select-pushdown.max-connections` and -the maximum connections configuration for the file system you are using. diff --git a/docs/src/main/sphinx/connector/hive.md b/docs/src/main/sphinx/connector/hive.md index c972867bacee7..05875dae76565 100644 --- a/docs/src/main/sphinx/connector/hive.md +++ b/docs/src/main/sphinx/connector/hive.md @@ -253,16 +253,6 @@ Hive connector documentation. - Enables automatic column level statistics collection on write. See `Table Statistics <#table-statistics>`__ for details. - ``true`` - * - ``hive.s3select-pushdown.enabled`` - - Enable query pushdown to JSON files using the AWS S3 Select service. - - ``false`` - * - ``hive.s3select-pushdown.experimental-textfile-pushdown-enabled`` - - Enable query pushdown to TEXTFILE tables using the AWS S3 Select service. - - ``false`` - * - ``hive.s3select-pushdown.max-connections`` - - Maximum number of simultaneously open connections to S3 for - :ref:`s3selectpushdown`. - - 500 * - ``hive.file-status-cache-tables`` - Cache directory listing for specific tables. Examples: diff --git a/docs/src/main/sphinx/release/release-300.md b/docs/src/main/sphinx/release/release-300.md index ae16f369073f2..ecf4917cac99b 100644 --- a/docs/src/main/sphinx/release/release-300.md +++ b/docs/src/main/sphinx/release/release-300.md @@ -46,7 +46,7 @@ (e.g., min > max). To disable this behavior, set the configuration property `hive.parquet.fail-on-corrupted-statistics` or session property `parquet_fail_with_corrupted_statistics` to false. -- Add support for {ref}`s3selectpushdown`, which enables pushing down +- Add support for S3 Select pushdown, which enables pushing down column selection and range filters into S3 for text files. 
## Kudu connector diff --git a/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureBlobFileIterator.java b/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureBlobFileIterator.java index d221dd4947c77..47b79a49dae96 100644 --- a/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureBlobFileIterator.java +++ b/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureBlobFileIterator.java @@ -30,15 +30,13 @@ final class AzureBlobFileIterator { private final AzureLocation location; private final Iterator iterator; - private final String base; + private final Location baseLocation; AzureBlobFileIterator(AzureLocation location, Iterator iterator) { this.location = requireNonNull(location, "location is null"); this.iterator = requireNonNull(iterator, "iterator is null"); - this.base = "abfs://%s%s.dfs.core.windows.net".formatted( - location.container().map(container -> container + "@").orElse(""), - location.account()); + this.baseLocation = location.baseLocation(); } @Override @@ -60,7 +58,7 @@ public FileEntry next() try { BlobItem blobItem = iterator.next(); return new FileEntry( - Location.of(base + "/" + blobItem.getName()), + baseLocation.appendPath(blobItem.getName()), blobItem.getProperties().getContentLength(), blobItem.getProperties().getLastModified().toInstant(), Optional.empty()); diff --git a/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureDataLakeFileIterator.java b/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureDataLakeFileIterator.java index 14d933cdba1f0..50dc7a38c5a1a 100644 --- a/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureDataLakeFileIterator.java +++ b/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureDataLakeFileIterator.java @@ -23,21 +23,20 @@ import java.util.Optional; import static io.trino.filesystem.azure.AzureUtils.handleAzureException; +import static java.util.Objects.requireNonNull; final class AzureDataLakeFileIterator implements FileIterator { private final AzureLocation location; private final Iterator iterator; - private final String base; + private final Location baseLocation; AzureDataLakeFileIterator(AzureLocation location, Iterator iterator) { - this.location = location; - this.iterator = iterator; - this.base = "abfs://%s%s.dfs.core.windows.net".formatted( - location.container().map(container -> container + "@").orElse(""), - location.account()); + this.location = requireNonNull(location, "location is null"); + this.iterator = requireNonNull(iterator, "iterator is null"); + this.baseLocation = location.baseLocation(); } @Override @@ -59,7 +58,7 @@ public FileEntry next() try { PathItem pathItem = iterator.next(); return new FileEntry( - Location.of(base + "/" + pathItem.getName()), + baseLocation.appendPath(pathItem.getName()), pathItem.getContentLength(), pathItem.getLastModified().toInstant(), Optional.empty()); diff --git a/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureFileSystem.java b/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureFileSystem.java index ca1c4728b0cda..4292ba294df2f 100644 --- a/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureFileSystem.java +++ b/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureFileSystem.java @@ -33,6 +33,7 @@ import com.azure.storage.file.datalake.models.ListPathsOptions; import com.azure.storage.file.datalake.models.PathItem; import 
com.azure.storage.file.datalake.options.DataLakePathDeleteOptions; +import com.google.common.collect.ImmutableSet; import io.airlift.units.DataSize; import io.trino.filesystem.FileIterator; import io.trino.filesystem.Location; @@ -44,9 +45,11 @@ import java.util.Objects; import java.util.Optional; import java.util.OptionalLong; +import java.util.Set; import static com.azure.storage.common.implementation.Constants.HeaderConstants.ETAG_WILDCARD; import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.collect.ImmutableSet.toImmutableSet; import static io.trino.filesystem.azure.AzureUtils.handleAzureException; import static java.lang.Math.toIntExact; import static java.util.Objects.requireNonNull; @@ -220,13 +223,10 @@ public FileIterator listFiles(Location location) { AzureLocation azureLocation = new AzureLocation(location); try { - // blob api returns directories as blobs, so it can not be used when Gen2 is enabled - if (isHierarchicalNamespaceEnabled(azureLocation)) { - return listGen2Files(azureLocation); - } - else { - return listBlobFiles(azureLocation); - } + // blob API returns directories as blobs, so it cannot be used when Gen2 is enabled + return (isHierarchicalNamespaceEnabled(azureLocation)) + ? listGen2Files(azureLocation) + : listBlobFiles(azureLocation); } catch (RuntimeException e) { throw handleAzureException(e, "listing files", azureLocation); @@ -258,7 +258,7 @@ private FileIterator listGen2Files(AzureLocation location) .iterator()); } - private AzureBlobFileIterator listBlobFiles(AzureLocation location) + private FileIterator listBlobFiles(AzureLocation location) { String path = location.path(); if (!path.isEmpty() && !path.endsWith("/")) { @@ -354,6 +354,60 @@ public void renameDirectory(Location source, Location target) } } + @Override + public Set listDirectories(Location location) + throws IOException + { + AzureLocation azureLocation = new AzureLocation(location); + try { + // blob API returns directories as blobs, so it cannot be used when Gen2 is enabled + return (isHierarchicalNamespaceEnabled(azureLocation)) + ? 
listGen2Directories(azureLocation) + : listBlobDirectories(azureLocation); + } + catch (RuntimeException e) { + throw handleAzureException(e, "listing files", azureLocation); + } + } + + private Set listGen2Directories(AzureLocation location) + throws IOException + { + DataLakeFileSystemClient fileSystemClient = createFileSystemClient(location); + PagedIterable pathItems; + if (location.path().isEmpty()) { + pathItems = fileSystemClient.listPaths(); + } + else { + DataLakeDirectoryClient directoryClient = fileSystemClient.getDirectoryClient(location.path()); + if (!directoryClient.exists()) { + return ImmutableSet.of(); + } + if (!directoryClient.getProperties().isDirectory()) { + throw new IOException("Location is not a directory: " + location); + } + pathItems = directoryClient.listPaths(true, false, null, null); + } + Location baseLocation = location.baseLocation(); + return pathItems.stream() + .filter(PathItem::isDirectory) + .map(item -> baseLocation.appendPath(item.getName())) + .collect(toImmutableSet()); + } + + private Set listBlobDirectories(AzureLocation location) + { + String path = location.path(); + if (!path.isEmpty() && !path.endsWith("/")) { + path += "/"; + } + return createBlobContainerClient(location) + .listBlobsByHierarchy(path).stream() + .filter(BlobItem::isPrefix) + .map(item -> Location.of(location + "/" + item.getName())) + .collect(toImmutableSet()); + } + private boolean isHierarchicalNamespaceEnabled(AzureLocation location) throws IOException { diff --git a/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureLocation.java b/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureLocation.java index 15336a3473ccb..c2fc47b5cfc88 100644 --- a/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureLocation.java +++ b/lib/trino-filesystem-azure/src/main/java/io/trino/filesystem/azure/AzureLocation.java @@ -113,4 +113,11 @@ public String toString() { return location.toString(); } + + public Location baseLocation() + { + return Location.of("abfs://%s%s.dfs.core.windows.net/".formatted( + container().map(container -> container + "@").orElse(""), + account())); + } } diff --git a/lib/trino-filesystem-manager/src/main/java/io/trino/filesystem/manager/SwitchingFileSystem.java b/lib/trino-filesystem-manager/src/main/java/io/trino/filesystem/manager/SwitchingFileSystem.java index 670da1f2fea30..041a89a720c67 100644 --- a/lib/trino-filesystem-manager/src/main/java/io/trino/filesystem/manager/SwitchingFileSystem.java +++ b/lib/trino-filesystem-manager/src/main/java/io/trino/filesystem/manager/SwitchingFileSystem.java @@ -27,6 +27,7 @@ import java.util.Collection; import java.util.Map; import java.util.Optional; +import java.util.Set; import static com.google.common.base.Preconditions.checkArgument; import static java.util.Objects.requireNonNull; @@ -130,6 +131,13 @@ public void renameDirectory(Location source, Location target) fileSystem(source).renameDirectory(source, target); } + @Override + public Set listDirectories(Location location) + throws IOException + { + return fileSystem(location).listDirectories(location); + } + private TrinoFileSystem fileSystem(Location location) { return createFileSystem(determineFactory(location)); diff --git a/lib/trino-filesystem-s3/src/main/java/io/trino/filesystem/s3/S3FileIterator.java b/lib/trino-filesystem-s3/src/main/java/io/trino/filesystem/s3/S3FileIterator.java index c63581225d800..e37ca49d6a7ab 100644 --- 
a/lib/trino-filesystem-s3/src/main/java/io/trino/filesystem/s3/S3FileIterator.java +++ b/lib/trino-filesystem-s3/src/main/java/io/trino/filesystem/s3/S3FileIterator.java @@ -31,13 +31,13 @@ final class S3FileIterator { private final S3Location location; private final Iterator iterator; - private final String base; + private final Location baseLocation; public S3FileIterator(S3Location location, Iterator iterator) { this.location = requireNonNull(location, "location is null"); this.iterator = requireNonNull(iterator, "iterator is null"); - this.base = "%s://%s/".formatted(location.scheme(), location.bucket()); + this.baseLocation = location.baseLocation(); } @Override @@ -62,7 +62,7 @@ public FileEntry next() verify(object.key().startsWith(location.key()), "S3 listed key [%s] does not start with prefix [%s]", object.key(), location.key()); return new FileEntry( - Location.of(base + object.key()), + baseLocation.appendPath(object.key()), object.size(), object.lastModified(), Optional.empty()); diff --git a/lib/trino-filesystem-s3/src/main/java/io/trino/filesystem/s3/S3FileSystem.java b/lib/trino-filesystem-s3/src/main/java/io/trino/filesystem/s3/S3FileSystem.java index 904b237078f0d..30d3b4f3e133a 100644 --- a/lib/trino-filesystem-s3/src/main/java/io/trino/filesystem/s3/S3FileSystem.java +++ b/lib/trino-filesystem-s3/src/main/java/io/trino/filesystem/s3/S3FileSystem.java @@ -22,6 +22,7 @@ import io.trino.filesystem.TrinoOutputFile; import software.amazon.awssdk.core.exception.SdkException; import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.model.CommonPrefix; import software.amazon.awssdk.services.s3.model.DeleteObjectRequest; import software.amazon.awssdk.services.s3.model.DeleteObjectsRequest; import software.amazon.awssdk.services.s3.model.DeleteObjectsResponse; @@ -39,7 +40,9 @@ import java.util.Map; import java.util.Map.Entry; import java.util.Optional; +import java.util.Set; +import static com.google.common.collect.ImmutableSet.toImmutableSet; import static com.google.common.collect.Iterables.partition; import static com.google.common.collect.Multimaps.toMultimap; import static java.util.Objects.requireNonNull; @@ -211,6 +214,36 @@ public void renameDirectory(Location source, Location target) throw new IOException("S3 does not support directory renames"); } + @Override + public Set listDirectories(Location location) + throws IOException + { + S3Location s3Location = new S3Location(location); + Location baseLocation = s3Location.baseLocation(); + + String key = s3Location.key(); + if (!key.isEmpty() && !key.endsWith("/")) { + key += "/"; + } + + ListObjectsV2Request request = ListObjectsV2Request.builder() + .bucket(s3Location.bucket()) + .prefix(key) + .delimiter("/") + .build(); + + try { + return client.listObjectsV2Paginator(request) + .commonPrefixes().stream() + .map(CommonPrefix::prefix) + .map(baseLocation::appendPath) + .collect(toImmutableSet()); + } + catch (SdkException e) { + throw new IOException("Failed to list location: " + location, e); + } + } + @SuppressWarnings("ResultOfObjectAllocationIgnored") private static void validateS3Location(Location location) { diff --git a/lib/trino-filesystem-s3/src/main/java/io/trino/filesystem/s3/S3Location.java b/lib/trino-filesystem-s3/src/main/java/io/trino/filesystem/s3/S3Location.java index 059453b16b03e..d4e2932c731a3 100644 --- a/lib/trino-filesystem-s3/src/main/java/io/trino/filesystem/s3/S3Location.java +++ b/lib/trino-filesystem-s3/src/main/java/io/trino/filesystem/s3/S3Location.java 
@@ -52,4 +52,9 @@ public String toString() { return location.toString(); } + + public Location baseLocation() + { + return Location.of("%s://%s/".formatted(scheme(), bucket())); + } } diff --git a/lib/trino-filesystem-s3/src/test/java/io/trino/filesystem/s3/TestS3FileSystemMinIo.java b/lib/trino-filesystem-s3/src/test/java/io/trino/filesystem/s3/TestS3FileSystemMinIo.java index 99b9251ba8285..f60672a4d92bf 100644 --- a/lib/trino-filesystem-s3/src/test/java/io/trino/filesystem/s3/TestS3FileSystemMinIo.java +++ b/lib/trino-filesystem-s3/src/test/java/io/trino/filesystem/s3/TestS3FileSystemMinIo.java @@ -108,4 +108,13 @@ public void testDeleteDirectory() // MinIO is not hierarchical but has hierarchical naming constraints. For example it's not possible to have two blobs "level0" and "level0/level1". testDeleteDirectory(true); } + + @Test + @Override + public void testListDirectories() + throws IOException + { + // MinIO is not hierarchical but has hierarchical naming constraints. For example it's not possible to have two blobs "level0" and "level0/level1". + testListDirectories(true); + } } diff --git a/lib/trino-filesystem/src/main/java/io/trino/filesystem/TrinoFileSystem.java b/lib/trino-filesystem/src/main/java/io/trino/filesystem/TrinoFileSystem.java index 9a3aebfb27b15..bc4d79ddb40ab 100644 --- a/lib/trino-filesystem/src/main/java/io/trino/filesystem/TrinoFileSystem.java +++ b/lib/trino-filesystem/src/main/java/io/trino/filesystem/TrinoFileSystem.java @@ -16,6 +16,7 @@ import java.io.IOException; import java.util.Collection; import java.util.Optional; +import java.util.Set; /** * TrinoFileSystem is the main abstraction for Trino to interact with data in cloud-like storage @@ -190,4 +191,20 @@ void createDirectory(Location location) */ void renameDirectory(Location source, Location target) throws IOException; + + /** + * Lists all directories that are direct descendants of the specified directory. + * The location can be empty, which lists all directories at the root of the file system, + * otherwise the location otherwise the location must end with a slash. + * If the location does not exist, an empty set is returned. + *
<p>
+ * For hierarchical file systems, if the path is not a directory, an exception is raised. + * For hierarchical file systems, if the path does not reference an existing directory, + * an empty iterator is returned. For blob file systems, all directories containing + * blobs that start with the location are listed. + * + * @throws IllegalArgumentException if location is not valid for this file system + */ + Set listDirectories(Location location) + throws IOException; } diff --git a/lib/trino-filesystem/src/main/java/io/trino/filesystem/local/LocalFileSystem.java b/lib/trino-filesystem/src/main/java/io/trino/filesystem/local/LocalFileSystem.java index 457dabe7a4148..f96315bf86a88 100644 --- a/lib/trino-filesystem/src/main/java/io/trino/filesystem/local/LocalFileSystem.java +++ b/lib/trino-filesystem/src/main/java/io/trino/filesystem/local/LocalFileSystem.java @@ -13,6 +13,7 @@ */ package io.trino.filesystem.local; +import com.google.common.collect.ImmutableSet; import io.trino.filesystem.FileIterator; import io.trino.filesystem.Location; import io.trino.filesystem.TrinoFileSystem; @@ -26,9 +27,13 @@ import java.nio.file.SimpleFileVisitor; import java.nio.file.attribute.BasicFileAttributes; import java.util.Optional; +import java.util.Set; +import java.util.stream.Stream; import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.collect.ImmutableSet.toImmutableSet; import static io.trino.filesystem.local.LocalUtils.handleException; +import static java.nio.file.LinkOption.NOFOLLOW_LINKS; /** * A hierarchical file system for testing. @@ -194,6 +199,26 @@ public void renameDirectory(Location source, Location target) } } + @Override + public Set listDirectories(Location location) + throws IOException + { + Path path = toDirectoryPath(location); + if (Files.isRegularFile(path)) { + throw new IOException("Location is a file: " + location); + } + if (!Files.isDirectory(path)) { + return ImmutableSet.of(); + } + try (Stream stream = Files.list(path)) { + return stream + .filter(file -> Files.isDirectory(file, NOFOLLOW_LINKS)) + .map(file -> file.getFileName() + "/") + .map(location::appendPath) + .collect(toImmutableSet()); + } + } + private Path toFilePath(Location location) { validateLocalLocation(location); diff --git a/lib/trino-filesystem/src/main/java/io/trino/filesystem/memory/MemoryFileSystem.java b/lib/trino-filesystem/src/main/java/io/trino/filesystem/memory/MemoryFileSystem.java index c096e3a50cec0..303f93b79fc32 100644 --- a/lib/trino-filesystem/src/main/java/io/trino/filesystem/memory/MemoryFileSystem.java +++ b/lib/trino-filesystem/src/main/java/io/trino/filesystem/memory/MemoryFileSystem.java @@ -13,6 +13,7 @@ */ package io.trino.filesystem.memory; +import com.google.common.collect.ImmutableSet; import io.airlift.slice.Slice; import io.trino.filesystem.FileEntry; import io.trino.filesystem.FileIterator; @@ -28,6 +29,7 @@ import java.util.Iterator; import java.util.Optional; import java.util.OptionalLong; +import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; @@ -180,6 +182,23 @@ public void renameDirectory(Location source, Location target) throw new IOException("Memory file system does not support directory renames"); } + @Override + public Set listDirectories(Location location) + throws IOException + { + String prefix = toBlobPrefix(location); + ImmutableSet.Builder directories = ImmutableSet.builder(); + for (String key : blobs.keySet()) { + if (key.startsWith(prefix)) { + int index = 
key.indexOf('/', prefix.length() + 1); + if (index >= 0) { + directories.add(Location.of("memory:///" + key.substring(0, index + 1))); + } + } + } + return directories.build(); + } + private static String toBlobKey(Location location) { validateMemoryLocation(location); diff --git a/lib/trino-filesystem/src/main/java/io/trino/filesystem/tracing/TracingFileSystem.java b/lib/trino-filesystem/src/main/java/io/trino/filesystem/tracing/TracingFileSystem.java index a1b69b42199da..2e0c694d05529 100644 --- a/lib/trino-filesystem/src/main/java/io/trino/filesystem/tracing/TracingFileSystem.java +++ b/lib/trino-filesystem/src/main/java/io/trino/filesystem/tracing/TracingFileSystem.java @@ -24,6 +24,7 @@ import java.io.IOException; import java.util.Collection; import java.util.Optional; +import java.util.Set; import static io.trino.filesystem.tracing.Tracing.withTracing; import static java.util.Objects.requireNonNull; @@ -137,4 +138,14 @@ public void renameDirectory(Location source, Location target) .startSpan(); withTracing(span, () -> delegate.renameDirectory(source, target)); } + + @Override + public Set listDirectories(Location location) + throws IOException + { + Span span = tracer.spanBuilder("FileSystem.listDirectories") + .setAttribute(FileSystemAttributes.FILE_LOCATION, location.toString()) + .startSpan(); + return withTracing(span, () -> delegate.listDirectories(location)); + } } diff --git a/lib/trino-filesystem/src/test/java/io/trino/filesystem/AbstractTestTrinoFileSystem.java b/lib/trino-filesystem/src/test/java/io/trino/filesystem/AbstractTestTrinoFileSystem.java index df5a088d914d1..64e28235cedfd 100644 --- a/lib/trino-filesystem/src/test/java/io/trino/filesystem/AbstractTestTrinoFileSystem.java +++ b/lib/trino-filesystem/src/test/java/io/trino/filesystem/AbstractTestTrinoFileSystem.java @@ -964,6 +964,7 @@ public void testRenameDirectory() assertThat(getFileSystem().directoryExists(createLocation("level0/level1"))).contains(true); assertThat(getFileSystem().directoryExists(createLocation("level0"))).contains(true); + // rename interior directory getFileSystem().renameDirectory(createLocation("level0/level1"), createLocation("level0/renamed")); assertThat(getFileSystem().directoryExists(createLocation("level0/level1"))).contains(false); @@ -973,11 +974,85 @@ public void testRenameDirectory() assertThat(getFileSystem().newInputFile(blob).exists()).isFalse(); - assertThat(readLocation(createLocation("level0/renamed/level2-file"))) + Location renamedBlob = createLocation("level0/renamed/level2-file"); + assertThat(readLocation(renamedBlob)) .isEqualTo(TEST_BLOB_CONTENT_PREFIX + blob); + + // rename to existing directory is an error + Location blob2 = createBlob(closer, "abc/xyz-file"); + + assertThat(getFileSystem().directoryExists(createLocation("abc"))).contains(true); + + assertThatThrownBy(() -> getFileSystem().renameDirectory(createLocation("abc"), createLocation("level0"))) + .isInstanceOf(IOException.class) + .hasMessageContaining(createLocation("abc").toString()) + .hasMessageContaining(createLocation("level0").toString()); + + assertThat(getFileSystem().newInputFile(blob2).exists()).isTrue(); + assertThat(getFileSystem().newInputFile(renamedBlob).exists()).isTrue(); + } + } + + @Test + public void testListDirectories() + throws IOException + { + testListDirectories(isHierarchical()); + } + + protected void testListDirectories(boolean hierarchicalNamingConstraints) + throws IOException + { + try (Closer closer = Closer.create()) { + createTestDirectoryStructure(closer, 
hierarchicalNamingConstraints); + createBlob(closer, "level0/level1/level2/level3-file0"); + createBlob(closer, "level0/level1x/level2x-file0"); + createBlob(closer, "other/file"); + + assertThat(listDirectories("")).containsOnly( + createLocation("level0/"), + createLocation("other/")); + + assertThat(listDirectories("level0")).containsOnly( + createLocation("level0/level1/"), + createLocation("level0/level1x/")); + assertThat(listDirectories("level0/")).containsOnly( + createLocation("level0/level1/"), + createLocation("level0/level1x/")); + + assertThat(listDirectories("level0/level1")).containsOnly( + createLocation("level0/level1/level2/")); + assertThat(listDirectories("level0/level1/")).containsOnly( + createLocation("level0/level1/level2/")); + + assertThat(listDirectories("level0/level1/level2/level3")).isEmpty(); + assertThat(listDirectories("level0/level1/level2/level3/")).isEmpty(); + + assertThat(listDirectories("unknown")).isEmpty(); + assertThat(listDirectories("unknown/")).isEmpty(); + + if (isHierarchical()) { + assertThatThrownBy(() -> listDirectories("level0-file0")) + .isInstanceOf(IOException.class) + .hasMessageContaining(createLocation("level0-file0").toString()); + } + else { + assertThat(listDirectories("level0-file0")).isEmpty(); + } + + if (!hierarchicalNamingConstraints && !normalizesListFilesResult()) { + // this lists a path in a directory with an empty name + assertThat(listDirectories("/")).isEmpty(); + } } } + private Set listDirectories(String path) + throws IOException + { + return getFileSystem().listDirectories(createListingLocation(path)); + } + private List listPath(String path) throws IOException { diff --git a/lib/trino-filesystem/src/test/java/io/trino/filesystem/TrackingFileSystemFactory.java b/lib/trino-filesystem/src/test/java/io/trino/filesystem/TrackingFileSystemFactory.java index 1fe8c17403c16..b3ba14f5b9972 100644 --- a/lib/trino-filesystem/src/test/java/io/trino/filesystem/TrackingFileSystemFactory.java +++ b/lib/trino-filesystem/src/test/java/io/trino/filesystem/TrackingFileSystemFactory.java @@ -24,6 +24,7 @@ import java.util.Map; import java.util.Optional; import java.util.OptionalLong; +import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Consumer; @@ -184,6 +185,13 @@ public void renameDirectory(Location source, Location target) { delegate.renameDirectory(source, target); } + + @Override + public Set listDirectories(Location location) + throws IOException + { + return delegate.listDirectories(location); + } } private static class TrackingInputFile diff --git a/lib/trino-hdfs/src/main/java/io/trino/filesystem/hdfs/HdfsFileIterator.java b/lib/trino-hdfs/src/main/java/io/trino/filesystem/hdfs/HdfsFileIterator.java index fcfa4da3c0a4d..78fc58fbe7b3b 100644 --- a/lib/trino-hdfs/src/main/java/io/trino/filesystem/hdfs/HdfsFileIterator.java +++ b/lib/trino-hdfs/src/main/java/io/trino/filesystem/hdfs/HdfsFileIterator.java @@ -84,25 +84,28 @@ public FileEntry next() throw new IOException("Listing location is a file, not a directory: " + listingLocation); } - String root = listingPath.toUri().getPath(); - String path = status.getPath().toUri().getPath(); - - verify(path.startsWith(root), "iterator path [%s] not a child of listing path [%s] for location [%s]", path, root, listingLocation); - - int index = root.endsWith("/") ? 
root.length() : root.length() + 1; - Location location = listingLocation.appendPath(path.substring(index)); - List blocks = Stream.of(status.getBlockLocations()) .map(HdfsFileIterator::toTrinoBlock) .collect(toImmutableList()); return new FileEntry( - location, + listedLocation(listingLocation, listingPath, status.getPath()), status.getLen(), Instant.ofEpochMilli(status.getModificationTime()), blocks.isEmpty() ? Optional.empty() : Optional.of(blocks)); } + static Location listedLocation(Location listingLocation, Path listingPath, Path listedPath) + { + String root = listingPath.toUri().getPath(); + String path = listedPath.toUri().getPath(); + + verify(path.startsWith(root), "iterator path [%s] not a child of listing path [%s] for location [%s]", path, root, listingLocation); + + int index = root.endsWith("/") ? root.length() : root.length() + 1; + return listingLocation.appendPath(path.substring(index)); + } + private static Block toTrinoBlock(BlockLocation location) { try { diff --git a/lib/trino-hdfs/src/main/java/io/trino/filesystem/hdfs/HdfsFileSystem.java b/lib/trino-hdfs/src/main/java/io/trino/filesystem/hdfs/HdfsFileSystem.java index ccc66ede3fc5e..aa1337607e8a4 100644 --- a/lib/trino-hdfs/src/main/java/io/trino/filesystem/hdfs/HdfsFileSystem.java +++ b/lib/trino-hdfs/src/main/java/io/trino/filesystem/hdfs/HdfsFileSystem.java @@ -14,6 +14,7 @@ package io.trino.filesystem.hdfs; import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; import io.airlift.stats.TimeStat; import io.trino.filesystem.FileIterator; import io.trino.filesystem.Location; @@ -37,9 +38,13 @@ import java.util.Map; import java.util.Map.Entry; import java.util.Optional; +import java.util.Set; import java.util.UUID; +import java.util.stream.Stream; +import static com.google.common.collect.ImmutableSet.toImmutableSet; import static io.trino.filesystem.hdfs.HadoopPaths.hadoopPath; +import static io.trino.filesystem.hdfs.HdfsFileIterator.listedLocation; import static io.trino.hdfs.FileSystemUtils.getRawFileSystem; import static java.util.Objects.requireNonNull; import static java.util.stream.Collectors.groupingBy; @@ -323,6 +328,9 @@ public void renameDirectory(Location source, Location target) if (!fileSystem.getFileStatus(sourcePath).isDirectory()) { throw new IOException("Source location is not a directory"); } + if (fileSystem.exists(targetPath)) { + throw new IOException("Target location already exists"); + } if (!fileSystem.rename(sourcePath, targetPath)) { throw new IOException("rename failed"); } @@ -335,6 +343,38 @@ public void renameDirectory(Location source, Location target) }); } + @Override + public Set listDirectories(Location location) + throws IOException + { + stats.getListDirectoriesCalls().newCall(); + Path directory = hadoopPath(location); + FileSystem fileSystem = environment.getFileSystem(context, directory); + return environment.doAs(context.getIdentity(), () -> { + try (TimeStat.BlockTimer ignored = stats.getListDirectoriesCalls().time()) { + FileStatus[] files = fileSystem.listStatus(directory); + if (files.length == 0) { + return ImmutableSet.of(); + } + if (files[0].getPath().equals(directory)) { + throw new IOException("Location is a file, not a directory: " + location); + } + return Stream.of(files) + .filter(FileStatus::isDirectory) + .map(file -> listedLocation(location, directory, file.getPath())) + .map(file -> file.appendSuffix("/")) + .collect(toImmutableSet()); + } + catch (FileNotFoundException e) { + return ImmutableSet.of(); + } + catch 
(IOException e) { + stats.getListDirectoriesCalls().recordException(e); + throw new IOException("List directories for %s failed: %s".formatted(location, e.getMessage()), e); + } + }); + } + private boolean hierarchical(FileSystem fileSystem, Location rootLocation) { Boolean knownResult = KNOWN_HIERARCHICAL_FILESYSTEMS.get(fileSystem.getScheme()); diff --git a/lib/trino-hdfs/src/main/java/io/trino/hdfs/ConfigurationUtils.java b/lib/trino-hdfs/src/main/java/io/trino/hdfs/ConfigurationUtils.java index 6e28ad11053bd..9181537b4e939 100644 --- a/lib/trino-hdfs/src/main/java/io/trino/hdfs/ConfigurationUtils.java +++ b/lib/trino-hdfs/src/main/java/io/trino/hdfs/ConfigurationUtils.java @@ -15,7 +15,6 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.mapred.JobConf; import java.io.File; import java.util.List; @@ -61,14 +60,6 @@ public static void copy(Configuration from, Configuration to) } } - public static JobConf toJobConf(Configuration conf) - { - if (conf instanceof JobConf) { - return (JobConf) conf; - } - return new JobConf(conf); - } - public static Configuration readConfiguration(List resourcePaths) { Configuration result = newEmptyConfiguration(); diff --git a/lib/trino-hdfs/src/main/java/io/trino/hdfs/HdfsEnvironment.java b/lib/trino-hdfs/src/main/java/io/trino/hdfs/HdfsEnvironment.java index 3d26819980d0b..fecc740176b64 100644 --- a/lib/trino-hdfs/src/main/java/io/trino/hdfs/HdfsEnvironment.java +++ b/lib/trino-hdfs/src/main/java/io/trino/hdfs/HdfsEnvironment.java @@ -125,11 +125,6 @@ public R doAs(ConnectorIdentity identity, GenericExcept return hdfsAuthentication.doAs(identity, action); } - public void doAs(ConnectorIdentity identity, Runnable action) - { - hdfsAuthentication.doAs(identity, action); - } - private static void stopFileSystemStatsThread() { try { diff --git a/lib/trino-hdfs/src/main/java/io/trino/hdfs/TrinoHdfsFileSystemStats.java b/lib/trino-hdfs/src/main/java/io/trino/hdfs/TrinoHdfsFileSystemStats.java index 18f072615d318..5b942370a1374 100644 --- a/lib/trino-hdfs/src/main/java/io/trino/hdfs/TrinoHdfsFileSystemStats.java +++ b/lib/trino-hdfs/src/main/java/io/trino/hdfs/TrinoHdfsFileSystemStats.java @@ -27,6 +27,7 @@ public final class TrinoHdfsFileSystemStats private final CallStats directoryExistsCalls = new CallStats(); private final CallStats createDirectoryCalls = new CallStats(); private final CallStats renameDirectoryCalls = new CallStats(); + private final CallStats listDirectoriesCalls = new CallStats(); @Managed @Nested @@ -90,4 +91,11 @@ public CallStats getRenameDirectoryCalls() { return renameDirectoryCalls; } + + @Managed + @Nested + public CallStats getListDirectoriesCalls() + { + return listDirectoriesCalls; + } } diff --git a/lib/trino-hdfs/src/main/java/io/trino/hdfs/authentication/HdfsAuthentication.java b/lib/trino-hdfs/src/main/java/io/trino/hdfs/authentication/HdfsAuthentication.java index ee98bda27d837..1d0da9a7fb15a 100644 --- a/lib/trino-hdfs/src/main/java/io/trino/hdfs/authentication/HdfsAuthentication.java +++ b/lib/trino-hdfs/src/main/java/io/trino/hdfs/authentication/HdfsAuthentication.java @@ -19,12 +19,4 @@ public interface HdfsAuthentication { R doAs(ConnectorIdentity identity, GenericExceptionAction action) throws E; - - default void doAs(ConnectorIdentity identity, Runnable action) - { - doAs(identity, () -> { - action.run(); - return null; - }); - } } diff --git a/lib/trino-hdfs/src/main/java/io/trino/hdfs/s3/HiveS3Module.java 
b/lib/trino-hdfs/src/main/java/io/trino/hdfs/s3/HiveS3Module.java index 99aed93534351..501bfcb973f7a 100644 --- a/lib/trino-hdfs/src/main/java/io/trino/hdfs/s3/HiveS3Module.java +++ b/lib/trino-hdfs/src/main/java/io/trino/hdfs/s3/HiveS3Module.java @@ -83,7 +83,6 @@ private void bindSecurityMapping(Binder binder) newSetBinder(binder, DynamicConfigurationProvider.class).addBinding() .to(S3SecurityMappingConfigurationProvider.class).in(Scopes.SINGLETON); - checkArgument(!getProperty("hive.s3select-pushdown.enabled").map(Boolean::parseBoolean).orElse(false), "S3 security mapping is not compatible with S3 Select pushdown"); checkArgument(!buildConfigObject(RubixEnabledConfig.class).isCacheEnabled(), "S3 security mapping is not compatible with Hive caching"); } diff --git a/plugin/trino-hive-hadoop2/bin/run_hive_s3_select_json_tests.sh b/plugin/trino-hive-hadoop2/bin/run_hive_s3_select_json_tests.sh deleted file mode 100755 index 1d7976ede475c..0000000000000 --- a/plugin/trino-hive-hadoop2/bin/run_hive_s3_select_json_tests.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env bash - -# Similar to run_hive_s3_tests.sh, but has only Amazon S3 Select JSON tests. This is in a separate file as the JsonSerDe -# class is only available in Hadoop 3.1 version, and so we would only test JSON pushdown against the 3.1 version. - -set -euo pipefail -x - -. "${BASH_SOURCE%/*}/common.sh" - -abort_if_not_gib_impacted - -check_vars S3_BUCKET S3_BUCKET_ENDPOINT \ - AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY - -cleanup_hadoop_docker_containers -start_hadoop_docker_containers - -test_directory="$(date '+%Y%m%d-%H%M%S')-$(uuidgen | sha1sum | cut -b 1-6)-s3select-json" - -# insert AWS credentials -deploy_core_site_xml core-site.xml.s3-template \ - AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY S3_BUCKET_ENDPOINT - -# create test tables -# can't use create_test_tables because the first table is created with different commands -table_path="s3a://${S3_BUCKET}/${test_directory}/trino_s3select_test_external_fs_json/" -exec_in_hadoop_master_container hadoop fs -mkdir -p "${table_path}" -exec_in_hadoop_master_container /docker/files/hadoop-put.sh /docker/files/test_table.json{,.gz,.bz2} "${table_path}" -exec_in_hadoop_master_container sudo -Eu hive beeline -u jdbc:hive2://localhost:10000/default -n hive -e " - CREATE EXTERNAL TABLE trino_s3select_test_external_fs_json(col_1 bigint, col_2 bigint) - ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' - LOCATION '${table_path}'" - -table_path="s3a://${S3_BUCKET}/${test_directory}/trino_s3select_test_json_scan_range_pushdown/" -exec_in_hadoop_master_container hadoop fs -mkdir -p "${table_path}" -exec_in_hadoop_master_container /docker/files/hadoop-put.sh /docker/files/test_table_json_scan_range_select_pushdown_{1,2,3}.json "${table_path}" -exec_in_hadoop_master_container sudo -Eu hive beeline -u jdbc:hive2://localhost:10000/default -n hive -e " - CREATE EXTERNAL TABLE trino_s3select_test_json_scan_range_pushdown(col_1 bigint, col_2 string, col_3 string, - col_4 string, col_5 string, col_6 string, col_7 string, col_8 string, col_9 string, col_10 string, col_11 string, - col_12 string, col_13 string, col_14 string) - ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' - LOCATION '${table_path}'" -stop_unnecessary_hadoop_services - -# restart hive-metastore to apply S3 changes in core-site.xml -docker exec "$(hadoop_master_container)" supervisorctl restart hive-metastore -retry check_hadoop - -# run product tests -pushd "${PROJECT_ROOT}" -set +e -./mvnw ${MAVEN_TEST:--B} -pl 
:trino-hive-hadoop2 test -P test-hive-hadoop2-s3-select-json \ - -DHADOOP_USER_NAME=hive \ - -Dhive.hadoop2.metastoreHost=localhost \ - -Dhive.hadoop2.metastorePort=9083 \ - -Dhive.hadoop2.databaseName=default \ - -Dhive.hadoop2.s3.awsAccessKey="${AWS_ACCESS_KEY_ID}" \ - -Dhive.hadoop2.s3.awsSecretKey="${AWS_SECRET_ACCESS_KEY}" \ - -Dhive.hadoop2.s3.writableBucket="${S3_BUCKET}" \ - -Dhive.hadoop2.s3.testDirectory="${test_directory}" -EXIT_CODE=$? -set -e -popd - -cleanup_hadoop_docker_containers - -exit "${EXIT_CODE}" diff --git a/plugin/trino-hive-hadoop2/bin/run_hive_s3_tests.sh b/plugin/trino-hive-hadoop2/bin/run_hive_s3_tests.sh index 57c3c090bf75f..0b9fb473e6dba 100755 --- a/plugin/trino-hive-hadoop2/bin/run_hive_s3_tests.sh +++ b/plugin/trino-hive-hadoop2/bin/run_hive_s3_tests.sh @@ -46,38 +46,6 @@ exec_in_hadoop_master_container /usr/bin/hive -e " LOCATION '${table_path}' TBLPROPERTIES ('skip.header.line.count'='2', 'skip.footer.line.count'='2')" -table_path="s3a://${S3_BUCKET}/${test_directory}/trino_s3select_test_external_fs_with_pipe_delimiter/" -exec_in_hadoop_master_container hadoop fs -mkdir -p "${table_path}" -exec_in_hadoop_master_container hadoop fs -put -f /docker/files/test_table_with_pipe_delimiter.csv{,.gz,.bz2} "${table_path}" -exec_in_hadoop_master_container /usr/bin/hive -e " - CREATE EXTERNAL TABLE trino_s3select_test_external_fs_with_pipe_delimiter(t_bigint bigint, s_bigint bigint) - ROW FORMAT DELIMITED - FIELDS TERMINATED BY '|' - STORED AS TEXTFILE - LOCATION '${table_path}'" - -table_path="s3a://${S3_BUCKET}/${test_directory}/trino_s3select_test_external_fs_with_comma_delimiter/" -exec_in_hadoop_master_container hadoop fs -mkdir -p "${table_path}" -exec_in_hadoop_master_container hadoop fs -put -f /docker/files/test_table_with_comma_delimiter.csv{,.gz,.bz2} "${table_path}" -exec_in_hadoop_master_container /usr/bin/hive -e " - CREATE EXTERNAL TABLE trino_s3select_test_external_fs_with_comma_delimiter(t_bigint bigint, s_bigint bigint) - ROW FORMAT DELIMITED - FIELDS TERMINATED BY ',' - STORED AS TEXTFILE - LOCATION '${table_path}'" - -table_path="s3a://${S3_BUCKET}/${test_directory}/trino_s3select_test_csv_scan_range_pushdown/" -exec_in_hadoop_master_container hadoop fs -mkdir -p "${table_path}" -exec_in_hadoop_master_container /docker/files/hadoop-put.sh /docker/files/test_table_csv_scan_range_select_pushdown_{1,2,3}.csv "${table_path}" -exec_in_hadoop_master_container sudo -Eu hive beeline -u jdbc:hive2://localhost:10000/default -n hive -e " - CREATE EXTERNAL TABLE trino_s3select_test_csv_scan_range_pushdown(index bigint, id string, value1 bigint, value2 bigint, value3 bigint, - value4 bigint, value5 bigint, title string, firstname string, lastname string, flag string, day bigint, - month bigint, year bigint, country string, comment string, email string, identifier string) - ROW FORMAT DELIMITED - FIELDS TERMINATED BY '|' - STORED AS TEXTFILE - LOCATION '${table_path}'" - stop_unnecessary_hadoop_services # restart hive-metastore to apply S3 changes in core-site.xml diff --git a/plugin/trino-hive-hadoop2/pom.xml b/plugin/trino-hive-hadoop2/pom.xml index 7a271decad4b7..842f419698f83 100644 --- a/plugin/trino-hive-hadoop2/pom.xml +++ b/plugin/trino-hive-hadoop2/pom.xml @@ -186,6 +186,12 @@ test + + io.trino.hive + hive-apache + test + + org.assertj assertj-core @@ -215,10 +221,6 @@ **/TestHive.java **/TestHiveThriftMetastoreWithS3.java **/TestHiveFileSystemS3.java - **/TestHiveFileSystemS3SelectPushdown.java - **/TestHiveFileSystemS3SelectJsonPushdown.java - 
**/TestHiveFileSystemS3SelectCsvPushdownWithSplits.java - **/TestHiveFileSystemS3SelectJsonPushdownWithSplits.java **/TestHiveFileSystemWasb.java **/TestHiveFileSystemAbfsAccessKey.java **/TestHiveFileSystemAbfsOAuth.java @@ -257,25 +259,6 @@ **/TestHiveThriftMetastoreWithS3.java **/TestHiveFileSystemS3.java - **/TestHiveFileSystemS3SelectPushdown.java - **/TestHiveFileSystemS3SelectCsvPushdownWithSplits.java - - - - - - - - test-hive-hadoop2-s3-select-json - - - - org.apache.maven.plugins - maven-surefire-plugin - - - **/TestHiveFileSystemS3SelectJsonPushdown.java - **/TestHiveFileSystemS3SelectJsonPushdownWithSplits.java diff --git a/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/AbstractTestHiveFileSystemAbfs.java b/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/AbstractTestHiveFileSystemAbfs.java index ad253f1166e80..532323a3ffa0f 100644 --- a/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/AbstractTestHiveFileSystemAbfs.java +++ b/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/AbstractTestHiveFileSystemAbfs.java @@ -66,7 +66,6 @@ protected void setup(String host, int port, String databaseName, String containe checkParameter(host, "host"), port, checkParameter(databaseName, "database name"), - false, createHdfsConfiguration()); } diff --git a/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/AbstractTestHiveFileSystemS3.java b/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/AbstractTestHiveFileSystemS3.java deleted file mode 100644 index 801ea4f667c03..0000000000000 --- a/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/AbstractTestHiveFileSystemS3.java +++ /dev/null @@ -1,261 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package io.trino.plugin.hive; - -import com.amazonaws.auth.AWSStaticCredentialsProvider; -import com.amazonaws.auth.BasicAWSCredentials; -import com.amazonaws.client.builder.AwsClientBuilder; -import com.amazonaws.services.s3.AmazonS3; -import com.amazonaws.services.s3.AmazonS3Client; -import com.amazonaws.services.s3.model.ObjectMetadata; -import com.amazonaws.services.s3.model.PutObjectRequest; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableSet; -import com.google.common.collect.Streams; -import com.google.common.net.MediaType; -import io.trino.filesystem.Location; -import io.trino.filesystem.TrinoFileSystem; -import io.trino.filesystem.hdfs.HdfsFileSystemFactory; -import io.trino.hdfs.ConfigurationInitializer; -import io.trino.hdfs.DynamicHdfsConfiguration; -import io.trino.hdfs.HdfsConfig; -import io.trino.hdfs.HdfsConfiguration; -import io.trino.hdfs.HdfsConfigurationInitializer; -import io.trino.hdfs.HdfsNamenodeStats; -import io.trino.hdfs.TrinoHdfsFileSystemStats; -import io.trino.hdfs.s3.HiveS3Config; -import io.trino.hdfs.s3.TrinoS3ConfigurationInitializer; -import io.trino.plugin.hive.fs.FileSystemDirectoryLister; -import io.trino.plugin.hive.fs.HiveFileIterator; -import io.trino.plugin.hive.fs.TrinoFileStatus; -import io.trino.plugin.hive.metastore.Column; -import io.trino.plugin.hive.metastore.StorageFormat; -import io.trino.plugin.hive.metastore.Table; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.testng.annotations.Test; - -import java.util.Arrays; -import java.util.List; -import java.util.Optional; - -import static com.google.common.base.Preconditions.checkArgument; -import static io.trino.plugin.hive.HiveTestUtils.SESSION; -import static io.trino.plugin.hive.HiveType.HIVE_LONG; -import static io.trino.plugin.hive.HiveType.HIVE_STRING; -import static java.io.InputStream.nullInputStream; -import static java.lang.String.format; -import static org.assertj.core.api.Assertions.assertThat; -import static org.testng.Assert.assertFalse; -import static org.testng.util.Strings.isNullOrEmpty; - -public abstract class AbstractTestHiveFileSystemS3 - extends AbstractTestHiveFileSystem -{ - private static final MediaType DIRECTORY_MEDIA_TYPE = MediaType.create("application", "x-directory"); - - private String awsAccessKey; - private String awsSecretKey; - private String writableBucket; - private String testDirectory; - private AmazonS3 s3Client; - - protected void setup( - String host, - int port, - String databaseName, - String s3endpoint, - String awsAccessKey, - String awsSecretKey, - String writableBucket, - String testDirectory, - boolean s3SelectPushdownEnabled) - { - checkArgument(!isNullOrEmpty(host), "Expected non empty host"); - checkArgument(!isNullOrEmpty(databaseName), "Expected non empty databaseName"); - checkArgument(!isNullOrEmpty(awsAccessKey), "Expected non empty awsAccessKey"); - checkArgument(!isNullOrEmpty(awsSecretKey), "Expected non empty awsSecretKey"); - checkArgument(!isNullOrEmpty(s3endpoint), "Expected non empty s3endpoint"); - checkArgument(!isNullOrEmpty(writableBucket), "Expected non empty writableBucket"); - checkArgument(!isNullOrEmpty(testDirectory), "Expected non empty testDirectory"); - this.awsAccessKey = awsAccessKey; - this.awsSecretKey = awsSecretKey; - this.writableBucket = writableBucket; - this.testDirectory = testDirectory; - - s3Client = AmazonS3Client.builder() - .withEndpointConfiguration(new AwsClientBuilder.EndpointConfiguration(s3endpoint, null)) 
- .withCredentials(new AWSStaticCredentialsProvider(new BasicAWSCredentials(awsAccessKey, awsSecretKey))) - .build(); - - setup(host, port, databaseName, s3SelectPushdownEnabled, createHdfsConfiguration()); - } - - private HdfsConfiguration createHdfsConfiguration() - { - ConfigurationInitializer s3Config = new TrinoS3ConfigurationInitializer(new HiveS3Config() - .setS3AwsAccessKey(awsAccessKey) - .setS3AwsSecretKey(awsSecretKey)); - HdfsConfigurationInitializer initializer = new HdfsConfigurationInitializer(new HdfsConfig(), ImmutableSet.of(s3Config)); - return new DynamicHdfsConfiguration(initializer, ImmutableSet.of()); - } - - @Override - protected Path getBasePath() - { - // HDP 3.1 does not understand s3:// out of the box. - return new Path(format("s3a://%s/%s/", writableBucket, testDirectory)); - } - - @Test - public void testIgnoreHadoopFolderMarker() - throws Exception - { - Path basePath = getBasePath(); - FileSystem fs = hdfsEnvironment.getFileSystem(TESTING_CONTEXT, basePath); - - String markerFileName = "test_table_$folder$"; - Path filePath = new Path(basePath, markerFileName); - fs.create(filePath).close(); - - assertFalse(Arrays.stream(fs.listStatus(basePath)).anyMatch(file -> file.getPath().getName().equalsIgnoreCase(markerFileName))); - } - - /** - * Tests the same functionality like {@link #testFileIteratorPartitionedListing()} with the - * setup done by native {@link AmazonS3} - */ - @Test - public void testFileIteratorPartitionedListingNativeS3Client() - throws Exception - { - Table.Builder tableBuilder = Table.builder() - .setDatabaseName(table.getSchemaName()) - .setTableName(table.getTableName()) - .setDataColumns(ImmutableList.of(new Column("data", HIVE_LONG, Optional.empty()))) - .setPartitionColumns(ImmutableList.of(new Column("part", HIVE_STRING, Optional.empty()))) - .setOwner(Optional.empty()) - .setTableType("fake"); - tableBuilder.getStorageBuilder() - .setStorageFormat(StorageFormat.fromHiveStorageFormat(HiveStorageFormat.CSV)); - Table fakeTable = tableBuilder.build(); - - Path basePath = new Path(getBasePath(), "test-file-iterator-partitioned-listing-native-setup"); - FileSystem fs = hdfsEnvironment.getFileSystem(TESTING_CONTEXT, basePath); - TrinoFileSystem trinoFileSystem = new HdfsFileSystemFactory(hdfsEnvironment, new TrinoHdfsFileSystemStats()).create(SESSION); - fs.mkdirs(basePath); - String basePrefix = basePath.toUri().getPath().substring(1); - - // Expected file system tree: - // test-file-iterator-partitioned-listing-native-setup/ - // .hidden/ - // nested-file-in-hidden.txt - // part=simple/ - // _hidden-file.txt - // plain-file.txt - // part=nested/ - // parent/ - // _nested-hidden-file.txt - // nested-file.txt - // part=plus+sign/ - // plus-file.txt - // part=percent%sign/ - // percent-file.txt - // part=url%20encoded/ - // url-encoded-file.txt - // part=level1|level2/ - // pipe-file.txt - // parent1/ - // parent2/ - // deeply-nested-file.txt - // part=level1 | level2/ - // pipe-blanks-file.txt - // empty-directory/ - // .hidden-in-base.txt - - createFile(writableBucket, format("%s/.hidden/nested-file-in-hidden.txt", basePrefix)); - createFile(writableBucket, format("%s/part=simple/_hidden-file.txt", basePrefix)); - createFile(writableBucket, format("%s/part=simple/plain-file.txt", basePrefix)); - createFile(writableBucket, format("%s/part=nested/parent/_nested-hidden-file.txt", basePrefix)); - createFile(writableBucket, format("%s/part=nested/parent/nested-file.txt", basePrefix)); - createFile(writableBucket, 
format("%s/part=plus+sign/plus-file.txt", basePrefix)); - createFile(writableBucket, format("%s/part=percent%%sign/percent-file.txt", basePrefix)); - createFile(writableBucket, format("%s/part=url%%20encoded/url-encoded-file.txt", basePrefix)); - createFile(writableBucket, format("%s/part=level1|level2/pipe-file.txt", basePrefix)); - createFile(writableBucket, format("%s/part=level1|level2/parent1/parent2/deeply-nested-file.txt", basePrefix)); - createFile(writableBucket, format("%s/part=level1 | level2/pipe-blanks-file.txt", basePrefix)); - createDirectory(writableBucket, format("%s/empty-directory/", basePrefix)); - createFile(writableBucket, format("%s/.hidden-in-base.txt", basePrefix)); - - // List recursively through hive file iterator - HiveFileIterator recursiveIterator = new HiveFileIterator( - fakeTable, - Location.of(basePath.toString()), - trinoFileSystem, - new FileSystemDirectoryLister(), - new HdfsNamenodeStats(), - HiveFileIterator.NestedDirectoryPolicy.RECURSE); - - List recursiveListing = Streams.stream(recursiveIterator) - .map(TrinoFileStatus::getPath) - .toList(); - // Should not include directories, or files underneath hidden directories - assertThat(recursiveListing).containsExactlyInAnyOrder( - format("%s/part=simple/plain-file.txt", basePath), - format("%s/part=nested/parent/nested-file.txt", basePath), - format("%s/part=plus+sign/plus-file.txt", basePath), - format("%s/part=percent%%sign/percent-file.txt", basePath), - format("%s/part=url%%20encoded/url-encoded-file.txt", basePath), - format("%s/part=level1|level2/pipe-file.txt", basePath), - format("%s/part=level1|level2/parent1/parent2/deeply-nested-file.txt", basePath), - format("%s/part=level1 | level2/pipe-blanks-file.txt", basePath)); - - HiveFileIterator shallowIterator = new HiveFileIterator( - fakeTable, - Location.of(basePath.toString()), - trinoFileSystem, - new FileSystemDirectoryLister(), - new HdfsNamenodeStats(), - HiveFileIterator.NestedDirectoryPolicy.IGNORED); - List shallowListing = Streams.stream(shallowIterator) - .map(TrinoFileStatus::getPath) - .map(Path::new) - .toList(); - // Should not include any hidden files, folders, or nested files - assertThat(shallowListing).isEmpty(); - } - - protected void createDirectory(String bucketName, String key) - { - // create meta-data for your folder and set content-length to 0 - ObjectMetadata metadata = new ObjectMetadata(); - metadata.setContentLength(0); - metadata.setContentType(DIRECTORY_MEDIA_TYPE.toString()); - // create a PutObjectRequest passing the folder name suffixed by / - if (!key.endsWith("/")) { - key += "/"; - } - PutObjectRequest putObjectRequest = new PutObjectRequest(bucketName, key, nullInputStream(), metadata); - // send request to S3 to create folder - s3Client.putObject(putObjectRequest); - } - - protected void createFile(String bucketName, String key) - { - ObjectMetadata metadata = new ObjectMetadata(); - metadata.setContentLength(0); - PutObjectRequest putObjectRequest = new PutObjectRequest(bucketName, key, nullInputStream(), metadata); - s3Client.putObject(putObjectRequest); - } -} diff --git a/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/TestHiveFileSystemAdl.java b/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/TestHiveFileSystemAdl.java index fad889d28fac5..b31c642964658 100644 --- a/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/TestHiveFileSystemAdl.java +++ b/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/TestHiveFileSystemAdl.java @@ -73,7 +73,7 @@ public void 
setup(String host, int port, String databaseName, String dataLakeNam this.refreshUrl = refreshUrl; this.testDirectory = testDirectory; - super.setup(host, port, databaseName, false, createHdfsConfiguration()); + super.setup(host, port, databaseName, createHdfsConfiguration()); } private HdfsConfiguration createHdfsConfiguration() diff --git a/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/TestHiveFileSystemS3.java b/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/TestHiveFileSystemS3.java index c522c25d6a104..21d9dda644066 100644 --- a/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/TestHiveFileSystemS3.java +++ b/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/TestHiveFileSystemS3.java @@ -13,12 +13,65 @@ */ package io.trino.plugin.hive; +import com.amazonaws.auth.AWSStaticCredentialsProvider; +import com.amazonaws.auth.BasicAWSCredentials; +import com.amazonaws.client.builder.AwsClientBuilder; +import com.amazonaws.services.s3.AmazonS3; +import com.amazonaws.services.s3.AmazonS3Client; +import com.amazonaws.services.s3.model.ObjectMetadata; +import com.amazonaws.services.s3.model.PutObjectRequest; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Streams; +import com.google.common.net.MediaType; +import io.trino.filesystem.Location; +import io.trino.filesystem.TrinoFileSystem; +import io.trino.filesystem.hdfs.HdfsFileSystemFactory; +import io.trino.hdfs.ConfigurationInitializer; +import io.trino.hdfs.DynamicHdfsConfiguration; +import io.trino.hdfs.HdfsConfig; +import io.trino.hdfs.HdfsConfiguration; +import io.trino.hdfs.HdfsConfigurationInitializer; +import io.trino.hdfs.HdfsNamenodeStats; +import io.trino.hdfs.TrinoHdfsFileSystemStats; +import io.trino.hdfs.s3.HiveS3Config; +import io.trino.hdfs.s3.TrinoS3ConfigurationInitializer; +import io.trino.plugin.hive.fs.FileSystemDirectoryLister; +import io.trino.plugin.hive.fs.HiveFileIterator; +import io.trino.plugin.hive.fs.TrinoFileStatus; +import io.trino.plugin.hive.metastore.Column; +import io.trino.plugin.hive.metastore.StorageFormat; +import io.trino.plugin.hive.metastore.Table; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.testng.annotations.BeforeClass; import org.testng.annotations.Parameters; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.List; +import java.util.Optional; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.trino.plugin.hive.HiveTestUtils.SESSION; +import static io.trino.plugin.hive.HiveType.HIVE_LONG; +import static io.trino.plugin.hive.HiveType.HIVE_STRING; +import static java.io.InputStream.nullInputStream; +import static java.lang.String.format; +import static org.assertj.core.api.Assertions.assertThat; +import static org.testng.Assert.assertFalse; +import static org.testng.util.Strings.isNullOrEmpty; public class TestHiveFileSystemS3 - extends AbstractTestHiveFileSystemS3 + extends AbstractTestHiveFileSystem { + private static final MediaType DIRECTORY_MEDIA_TYPE = MediaType.create("application", "x-directory"); + private String awsAccessKey; + private String awsSecretKey; + private String writableBucket; + private String testDirectory; + private AmazonS3 s3Client; + @Parameters({ "hive.hadoop2.metastoreHost", "hive.hadoop2.metastorePort", @@ -32,6 +85,180 @@ public class TestHiveFileSystemS3 @BeforeClass public void setup(String host, int port, String databaseName, 
String s3endpoint, String awsAccessKey, String awsSecretKey, String writableBucket, String testDirectory)
     {
-        super.setup(host, port, databaseName, s3endpoint, awsAccessKey, awsSecretKey, writableBucket, testDirectory, false);
+        checkArgument(!isNullOrEmpty(host), "Expected non empty host");
+        checkArgument(!isNullOrEmpty(databaseName), "Expected non empty databaseName");
+        checkArgument(!isNullOrEmpty(awsAccessKey), "Expected non empty awsAccessKey");
+        checkArgument(!isNullOrEmpty(awsSecretKey), "Expected non empty awsSecretKey");
+        checkArgument(!isNullOrEmpty(s3endpoint), "Expected non empty s3endpoint");
+        checkArgument(!isNullOrEmpty(writableBucket), "Expected non empty writableBucket");
+        checkArgument(!isNullOrEmpty(testDirectory), "Expected non empty testDirectory");
+        this.awsAccessKey = awsAccessKey;
+        this.awsSecretKey = awsSecretKey;
+        this.writableBucket = writableBucket;
+        this.testDirectory = testDirectory;
+
+        s3Client = AmazonS3Client.builder()
+                .withEndpointConfiguration(new AwsClientBuilder.EndpointConfiguration(s3endpoint, null))
+                .withCredentials(new AWSStaticCredentialsProvider(new BasicAWSCredentials(awsAccessKey, awsSecretKey)))
+                .build();
+
+        setup(host, port, databaseName, createHdfsConfiguration());
+    }
+
+    private HdfsConfiguration createHdfsConfiguration()
+    {
+        ConfigurationInitializer s3Config = new TrinoS3ConfigurationInitializer(new HiveS3Config()
+                .setS3AwsAccessKey(awsAccessKey)
+                .setS3AwsSecretKey(awsSecretKey));
+        HdfsConfigurationInitializer initializer = new HdfsConfigurationInitializer(new HdfsConfig(), ImmutableSet.of(s3Config));
+        return new DynamicHdfsConfiguration(initializer, ImmutableSet.of());
+    }
+
+    @Override
+    protected Path getBasePath()
+    {
+        // HDP 3.1 does not understand s3:// out of the box.
+        return new Path(format("s3a://%s/%s/", writableBucket, testDirectory));
+    }
+
+    @Test
+    public void testIgnoreHadoopFolderMarker()
+            throws Exception
+    {
+        Path basePath = getBasePath();
+        FileSystem fs = hdfsEnvironment.getFileSystem(TESTING_CONTEXT, basePath);
+
+        String markerFileName = "test_table_$folder$";
+        Path filePath = new Path(basePath, markerFileName);
+        fs.create(filePath).close();
+
+        assertFalse(Arrays.stream(fs.listStatus(basePath)).anyMatch(file -> file.getPath().getName().equalsIgnoreCase(markerFileName)));
+    }
+
+    /**
+     * Tests the same functionality as {@link #testFileIteratorPartitionedListing()} with the
+     * setup done by the native {@link AmazonS3} client
+     */
+    @Test
+    public void testFileIteratorPartitionedListingNativeS3Client()
+            throws Exception
+    {
+        Table.Builder tableBuilder = Table.builder()
+                .setDatabaseName(table.getSchemaName())
+                .setTableName(table.getTableName())
+                .setDataColumns(ImmutableList.of(new Column("data", HIVE_LONG, Optional.empty())))
+                .setPartitionColumns(ImmutableList.of(new Column("part", HIVE_STRING, Optional.empty())))
+                .setOwner(Optional.empty())
+                .setTableType("fake");
+        tableBuilder.getStorageBuilder()
+                .setStorageFormat(StorageFormat.fromHiveStorageFormat(HiveStorageFormat.CSV));
+        Table fakeTable = tableBuilder.build();
+
+        Path basePath = new Path(getBasePath(), "test-file-iterator-partitioned-listing-native-setup");
+        FileSystem fs = hdfsEnvironment.getFileSystem(TESTING_CONTEXT, basePath);
+        TrinoFileSystem trinoFileSystem = new HdfsFileSystemFactory(hdfsEnvironment, new TrinoHdfsFileSystemStats()).create(SESSION);
+        fs.mkdirs(basePath);
+        String basePrefix = basePath.toUri().getPath().substring(1);
+
+        // Expected file system tree:
+        // test-file-iterator-partitioned-listing-native-setup/
+        //     .hidden/
+        //         nested-file-in-hidden.txt
+        //     part=simple/
+        //         _hidden-file.txt
+        //         plain-file.txt
+        //     part=nested/
+        //         parent/
+        //             _nested-hidden-file.txt
+        //             nested-file.txt
+        //     part=plus+sign/
+        //         plus-file.txt
+        //     part=percent%sign/
+        //         percent-file.txt
+        //     part=url%20encoded/
+        //         url-encoded-file.txt
+        //     part=level1|level2/
+        //         pipe-file.txt
+        //         parent1/
+        //             parent2/
+        //                 deeply-nested-file.txt
+        //     part=level1 | level2/
+        //         pipe-blanks-file.txt
+        //     empty-directory/
+        //     .hidden-in-base.txt
+
+        createFile(writableBucket, format("%s/.hidden/nested-file-in-hidden.txt", basePrefix));
+        createFile(writableBucket, format("%s/part=simple/_hidden-file.txt", basePrefix));
+        createFile(writableBucket, format("%s/part=simple/plain-file.txt", basePrefix));
+        createFile(writableBucket, format("%s/part=nested/parent/_nested-hidden-file.txt", basePrefix));
+        createFile(writableBucket, format("%s/part=nested/parent/nested-file.txt", basePrefix));
+        createFile(writableBucket, format("%s/part=plus+sign/plus-file.txt", basePrefix));
+        createFile(writableBucket, format("%s/part=percent%%sign/percent-file.txt", basePrefix));
+        createFile(writableBucket, format("%s/part=url%%20encoded/url-encoded-file.txt", basePrefix));
+        createFile(writableBucket, format("%s/part=level1|level2/pipe-file.txt", basePrefix));
+        createFile(writableBucket, format("%s/part=level1|level2/parent1/parent2/deeply-nested-file.txt", basePrefix));
+        createFile(writableBucket, format("%s/part=level1 | level2/pipe-blanks-file.txt", basePrefix));
+        createDirectory(writableBucket, format("%s/empty-directory/", basePrefix));
+        createFile(writableBucket, format("%s/.hidden-in-base.txt", basePrefix));
+
+        // List recursively through hive file iterator
+        HiveFileIterator
recursiveIterator = new HiveFileIterator( + fakeTable, + Location.of(basePath.toString()), + trinoFileSystem, + new FileSystemDirectoryLister(), + new HdfsNamenodeStats(), + HiveFileIterator.NestedDirectoryPolicy.RECURSE); + + List recursiveListing = Streams.stream(recursiveIterator) + .map(TrinoFileStatus::getPath) + .toList(); + // Should not include directories, or files underneath hidden directories + assertThat(recursiveListing).containsExactlyInAnyOrder( + format("%s/part=simple/plain-file.txt", basePath), + format("%s/part=nested/parent/nested-file.txt", basePath), + format("%s/part=plus+sign/plus-file.txt", basePath), + format("%s/part=percent%%sign/percent-file.txt", basePath), + format("%s/part=url%%20encoded/url-encoded-file.txt", basePath), + format("%s/part=level1|level2/pipe-file.txt", basePath), + format("%s/part=level1|level2/parent1/parent2/deeply-nested-file.txt", basePath), + format("%s/part=level1 | level2/pipe-blanks-file.txt", basePath)); + + HiveFileIterator shallowIterator = new HiveFileIterator( + fakeTable, + Location.of(basePath.toString()), + trinoFileSystem, + new FileSystemDirectoryLister(), + new HdfsNamenodeStats(), + HiveFileIterator.NestedDirectoryPolicy.IGNORED); + List shallowListing = Streams.stream(shallowIterator) + .map(TrinoFileStatus::getPath) + .map(Path::new) + .toList(); + // Should not include any hidden files, folders, or nested files + assertThat(shallowListing).isEmpty(); + } + + protected void createDirectory(String bucketName, String key) + { + // create meta-data for your folder and set content-length to 0 + ObjectMetadata metadata = new ObjectMetadata(); + metadata.setContentLength(0); + metadata.setContentType(DIRECTORY_MEDIA_TYPE.toString()); + // create a PutObjectRequest passing the folder name suffixed by / + if (!key.endsWith("/")) { + key += "/"; + } + PutObjectRequest putObjectRequest = new PutObjectRequest(bucketName, key, nullInputStream(), metadata); + // send request to S3 to create folder + s3Client.putObject(putObjectRequest); + } + + protected void createFile(String bucketName, String key) + { + ObjectMetadata metadata = new ObjectMetadata(); + metadata.setContentLength(0); + PutObjectRequest putObjectRequest = new PutObjectRequest(bucketName, key, nullInputStream(), metadata); + s3Client.putObject(putObjectRequest); } } diff --git a/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/TestHiveFileSystemWasb.java b/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/TestHiveFileSystemWasb.java index 3ac98d86636a8..cab2e698f4c98 100644 --- a/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/TestHiveFileSystemWasb.java +++ b/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/TestHiveFileSystemWasb.java @@ -61,7 +61,7 @@ public void setup(String host, int port, String databaseName, String container, this.accessKey = accessKey; this.testDirectory = testDirectory; - super.setup(host, port, databaseName, false, createHdfsConfiguration()); + super.setup(host, port, databaseName, createHdfsConfiguration()); } private HdfsConfiguration createHdfsConfiguration() diff --git a/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/s3select/S3SelectTestHelper.java b/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/s3select/S3SelectTestHelper.java deleted file mode 100644 index 00a8d48dbe0f4..0000000000000 --- a/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/s3select/S3SelectTestHelper.java +++ /dev/null @@ -1,293 +0,0 @@ -/* - * Licensed under the Apache License, 
Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.trino.plugin.hive.s3select; - -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSet; -import com.google.common.net.HostAndPort; -import io.airlift.concurrent.BoundedExecutor; -import io.airlift.json.JsonCodec; -import io.airlift.stats.CounterStat; -import io.trino.filesystem.hdfs.HdfsFileSystemFactory; -import io.trino.hdfs.ConfigurationInitializer; -import io.trino.hdfs.DynamicHdfsConfiguration; -import io.trino.hdfs.HdfsConfig; -import io.trino.hdfs.HdfsConfiguration; -import io.trino.hdfs.HdfsConfigurationInitializer; -import io.trino.hdfs.HdfsEnvironment; -import io.trino.hdfs.HdfsNamenodeStats; -import io.trino.hdfs.authentication.NoHdfsAuthentication; -import io.trino.hdfs.s3.HiveS3Config; -import io.trino.hdfs.s3.TrinoS3ConfigurationInitializer; -import io.trino.plugin.base.CatalogName; -import io.trino.plugin.hive.AbstractTestHiveFileSystem.TestingHiveMetastore; -import io.trino.plugin.hive.DefaultHiveMaterializedViewMetadataFactory; -import io.trino.plugin.hive.GenericHiveRecordCursorProvider; -import io.trino.plugin.hive.HiveConfig; -import io.trino.plugin.hive.HiveLocationService; -import io.trino.plugin.hive.HiveMetadataFactory; -import io.trino.plugin.hive.HivePageSourceProvider; -import io.trino.plugin.hive.HivePartitionManager; -import io.trino.plugin.hive.HiveSplitManager; -import io.trino.plugin.hive.HiveTransactionManager; -import io.trino.plugin.hive.LocationService; -import io.trino.plugin.hive.NodeVersion; -import io.trino.plugin.hive.NoneHiveRedirectionsProvider; -import io.trino.plugin.hive.PartitionUpdate; -import io.trino.plugin.hive.PartitionsSystemTableProvider; -import io.trino.plugin.hive.PropertiesSystemTableProvider; -import io.trino.plugin.hive.aws.athena.PartitionProjectionService; -import io.trino.plugin.hive.fs.FileSystemDirectoryLister; -import io.trino.plugin.hive.fs.TransactionScopeCachingDirectoryListerFactory; -import io.trino.plugin.hive.metastore.HiveMetastoreConfig; -import io.trino.plugin.hive.metastore.HiveMetastoreFactory; -import io.trino.plugin.hive.metastore.thrift.BridgingHiveMetastore; -import io.trino.plugin.hive.security.SqlStandardAccessControlMetadata; -import io.trino.spi.connector.ColumnHandle; -import io.trino.spi.connector.ConnectorPageSourceProvider; -import io.trino.spi.connector.ConnectorSession; -import io.trino.spi.connector.ConnectorSplitManager; -import io.trino.spi.connector.SchemaTableName; -import io.trino.spi.type.TestingTypeManager; -import io.trino.testing.MaterializedResult; -import org.apache.hadoop.fs.Path; - -import java.io.IOException; -import java.util.List; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.ScheduledExecutorService; -import java.util.stream.LongStream; - -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.util.concurrent.MoreExecutors.newDirectExecutorService; -import static io.airlift.concurrent.Threads.daemonThreadsNamed; -import static 
io.trino.plugin.hive.HiveFileSystemTestUtils.filterTable; -import static io.trino.plugin.hive.HiveFileSystemTestUtils.getSplitsCount; -import static io.trino.plugin.hive.HiveTestUtils.HDFS_FILE_SYSTEM_STATS; -import static io.trino.plugin.hive.HiveTestUtils.getDefaultHiveFileWriterFactories; -import static io.trino.plugin.hive.HiveTestUtils.getDefaultHivePageSourceFactories; -import static io.trino.plugin.hive.HiveTestUtils.getDefaultHiveRecordCursorProviders; -import static io.trino.plugin.hive.TestingThriftHiveMetastoreBuilder.testingThriftHiveMetastoreBuilder; -import static io.trino.spi.connector.MetadataProvider.NOOP_METADATA_PROVIDER; -import static io.trino.spi.type.BigintType.BIGINT; -import static io.trino.type.InternalTypeManager.TESTING_TYPE_MANAGER; -import static java.lang.String.format; -import static java.util.concurrent.Executors.newCachedThreadPool; -import static java.util.concurrent.Executors.newScheduledThreadPool; -import static org.testng.util.Strings.isNullOrEmpty; - -public class S3SelectTestHelper -{ - private HdfsEnvironment hdfsEnvironment; - private LocationService locationService; - private TestingHiveMetastore metastoreClient; - private HiveMetadataFactory metadataFactory; - private HiveTransactionManager transactionManager; - private ConnectorSplitManager splitManager; - private ConnectorPageSourceProvider pageSourceProvider; - - private ExecutorService executorService; - private HiveConfig hiveConfig; - private ScheduledExecutorService heartbeatService; - - public S3SelectTestHelper(String host, - int port, - String databaseName, - String awsAccessKey, - String awsSecretKey, - String writableBucket, - String testDirectory, - HiveConfig hiveConfig) - { - checkArgument(!isNullOrEmpty(host), "Expected non empty host"); - checkArgument(!isNullOrEmpty(databaseName), "Expected non empty databaseName"); - checkArgument(!isNullOrEmpty(awsAccessKey), "Expected non empty awsAccessKey"); - checkArgument(!isNullOrEmpty(awsSecretKey), "Expected non empty awsSecretKey"); - checkArgument(!isNullOrEmpty(writableBucket), "Expected non empty writableBucket"); - checkArgument(!isNullOrEmpty(testDirectory), "Expected non empty testDirectory"); - - executorService = newCachedThreadPool(daemonThreadsNamed("s3select-tests-%s")); - heartbeatService = newScheduledThreadPool(1); - - ConfigurationInitializer s3Config = new TrinoS3ConfigurationInitializer(new HiveS3Config() - .setS3AwsAccessKey(awsAccessKey) - .setS3AwsSecretKey(awsSecretKey)); - HdfsConfigurationInitializer initializer = new HdfsConfigurationInitializer(new HdfsConfig(), ImmutableSet.of(s3Config)); - HdfsConfiguration hdfsConfiguration = new DynamicHdfsConfiguration(initializer, ImmutableSet.of()); - - this.hiveConfig = hiveConfig; - HivePartitionManager hivePartitionManager = new HivePartitionManager(this.hiveConfig); - - hdfsEnvironment = new HdfsEnvironment(hdfsConfiguration, new HdfsConfig(), new NoHdfsAuthentication()); - locationService = new HiveLocationService(hdfsEnvironment, hiveConfig); - JsonCodec partitionUpdateCodec = JsonCodec.jsonCodec(PartitionUpdate.class); - - metastoreClient = new TestingHiveMetastore( - new BridgingHiveMetastore( - testingThriftHiveMetastoreBuilder() - .metastoreClient(HostAndPort.fromParts(host, port)) - .hiveConfig(this.hiveConfig) - .hdfsEnvironment(hdfsEnvironment) - .build()), - new Path(format("s3a://%s/%s/", writableBucket, testDirectory)), - hdfsEnvironment); - metadataFactory = new HiveMetadataFactory( - new CatalogName("hive"), - this.hiveConfig, - new 
HiveMetastoreConfig(), - HiveMetastoreFactory.ofInstance(metastoreClient), - getDefaultHiveFileWriterFactories(hiveConfig, hdfsEnvironment), - new HdfsFileSystemFactory(hdfsEnvironment, HDFS_FILE_SYSTEM_STATS), - hdfsEnvironment, - hivePartitionManager, - newDirectExecutorService(), - heartbeatService, - TESTING_TYPE_MANAGER, - NOOP_METADATA_PROVIDER, - locationService, - partitionUpdateCodec, - new NodeVersion("test_version"), - new NoneHiveRedirectionsProvider(), - ImmutableSet.of( - new PartitionsSystemTableProvider(hivePartitionManager, TESTING_TYPE_MANAGER), - new PropertiesSystemTableProvider()), - new DefaultHiveMaterializedViewMetadataFactory(), - SqlStandardAccessControlMetadata::new, - new FileSystemDirectoryLister(), - new TransactionScopeCachingDirectoryListerFactory(hiveConfig), - new PartitionProjectionService(this.hiveConfig, ImmutableMap.of(), new TestingTypeManager()), - true); - transactionManager = new HiveTransactionManager(metadataFactory); - - splitManager = new HiveSplitManager( - transactionManager, - hivePartitionManager, - new HdfsFileSystemFactory(hdfsEnvironment, HDFS_FILE_SYSTEM_STATS), - new HdfsNamenodeStats(), - hdfsEnvironment, - new BoundedExecutor(executorService, this.hiveConfig.getMaxSplitIteratorThreads()), - new CounterStat(), - this.hiveConfig.getMaxOutstandingSplits(), - this.hiveConfig.getMaxOutstandingSplitsSize(), - this.hiveConfig.getMinPartitionBatchSize(), - this.hiveConfig.getMaxPartitionBatchSize(), - this.hiveConfig.getMaxInitialSplits(), - this.hiveConfig.getSplitLoaderConcurrency(), - this.hiveConfig.getMaxSplitsPerSecond(), - this.hiveConfig.getRecursiveDirWalkerEnabled(), - TESTING_TYPE_MANAGER, - this.hiveConfig.getMaxPartitionsPerScan()); - - pageSourceProvider = new HivePageSourceProvider( - TESTING_TYPE_MANAGER, - hdfsEnvironment, - this.hiveConfig, - getDefaultHivePageSourceFactories(hdfsEnvironment, this.hiveConfig), - getDefaultHiveRecordCursorProviders(this.hiveConfig, hdfsEnvironment), - new GenericHiveRecordCursorProvider(hdfsEnvironment, this.hiveConfig)); - } - - public S3SelectTestHelper(String host, - int port, - String databaseName, - String awsAccessKey, - String awsSecretKey, - String writableBucket, - String testDirectory) - { - this(host, port, databaseName, awsAccessKey, awsSecretKey, writableBucket, testDirectory, new HiveConfig().setS3SelectPushdownEnabled(true)); - } - - public HiveTransactionManager getTransactionManager() - { - return transactionManager; - } - - public ConnectorSplitManager getSplitManager() - { - return splitManager; - } - - public ConnectorPageSourceProvider getPageSourceProvider() - { - return pageSourceProvider; - } - - public HiveConfig getHiveConfig() - { - return hiveConfig; - } - - public void tearDown() - { - hdfsEnvironment = null; - locationService = null; - metastoreClient = null; - metadataFactory = null; - transactionManager = null; - splitManager = null; - pageSourceProvider = null; - hiveConfig = null; - if (executorService != null) { - executorService.shutdownNow(); - executorService = null; - } - if (heartbeatService != null) { - heartbeatService.shutdownNow(); - heartbeatService = null; - } - } - - int getTableSplitsCount(SchemaTableName table) - { - return getSplitsCount( - table, - getTransactionManager(), - getHiveConfig(), - getSplitManager()); - } - - MaterializedResult getFilteredTableResult(SchemaTableName table, ColumnHandle column) - { - try { - return filterTable( - table, - List.of(column), - getTransactionManager(), - getHiveConfig(), - getPageSourceProvider(), - 
getSplitManager()); - } - catch (IOException ignored) { - } - - return null; - } - - static MaterializedResult expectedResult(ConnectorSession session, int start, int end) - { - MaterializedResult.Builder builder = MaterializedResult.resultBuilder(session, BIGINT); - LongStream.rangeClosed(start, end).forEach(builder::row); - return builder.build(); - } - - static boolean isSplitCountInOpenInterval(int splitCount, - int lowerBound, - int upperBound) - { - // Split number may vary, the minimum number of splits being obtained with - // the first split of maxInitialSplitSize and the rest of maxSplitSize - return lowerBound < splitCount && splitCount < upperBound; - } -} diff --git a/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/s3select/TestHiveFileSystemS3SelectCsvPushdownWithSplits.java b/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/s3select/TestHiveFileSystemS3SelectCsvPushdownWithSplits.java deleted file mode 100644 index 2edc5bd71f0bd..0000000000000 --- a/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/s3select/TestHiveFileSystemS3SelectCsvPushdownWithSplits.java +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.trino.plugin.hive.s3select; - -import io.airlift.units.DataSize; -import io.trino.plugin.hive.HiveConfig; -import io.trino.spi.connector.ColumnHandle; -import io.trino.spi.connector.SchemaTableName; -import io.trino.testing.MaterializedResult; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Parameters; -import org.testng.annotations.Test; - -import java.util.Optional; - -import static io.airlift.units.DataSize.Unit.KILOBYTE; -import static io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; -import static io.trino.plugin.hive.HiveColumnHandle.createBaseColumn; -import static io.trino.plugin.hive.HiveFileSystemTestUtils.newSession; -import static io.trino.plugin.hive.HiveType.HIVE_INT; -import static io.trino.plugin.hive.s3select.S3SelectTestHelper.expectedResult; -import static io.trino.plugin.hive.s3select.S3SelectTestHelper.isSplitCountInOpenInterval; -import static io.trino.spi.type.BigintType.BIGINT; -import static io.trino.testing.QueryAssertions.assertEqualsIgnoreOrder; -import static org.testng.Assert.assertTrue; - -public class TestHiveFileSystemS3SelectCsvPushdownWithSplits -{ - private String host; - private int port; - private String databaseName; - private String awsAccessKey; - private String awsSecretKey; - private String writableBucket; - private String testDirectory; - - private SchemaTableName tableCsvWithSplits; - - @Parameters({ - "hive.hadoop2.metastoreHost", - "hive.hadoop2.metastorePort", - "hive.hadoop2.databaseName", - "hive.hadoop2.s3.awsAccessKey", - "hive.hadoop2.s3.awsSecretKey", - "hive.hadoop2.s3.writableBucket", - "hive.hadoop2.s3.testDirectory", - }) - @BeforeClass - public void setup(String host, int port, String databaseName, String awsAccessKey, String awsSecretKey, 
String writableBucket, String testDirectory) - { - this.host = host; - this.port = port; - this.databaseName = databaseName; - this.awsAccessKey = awsAccessKey; - this.awsSecretKey = awsSecretKey; - this.writableBucket = writableBucket; - this.testDirectory = testDirectory; - - tableCsvWithSplits = new SchemaTableName(databaseName, "trino_s3select_test_csv_scan_range_pushdown"); - } - - @DataProvider(name = "testSplitSize") - public static Object[][] splitSizeParametersProvider() - { - return new Object[][] {{3, 2, 15, 30}, {50, 30, 2, 4}}; - } - - @Test(dataProvider = "testSplitSize") - public void testQueryPushdownWithSplitSizeForCsv(int maxSplitSizeKB, - int maxInitialSplitSizeKB, - int minSplitCount, - int maxSplitCount) - { - S3SelectTestHelper s3SelectTestHelper = null; - try { - HiveConfig hiveConfig = new HiveConfig() - .setS3SelectPushdownEnabled(true) - .setMaxSplitSize(DataSize.of(maxSplitSizeKB, KILOBYTE)) - .setMaxInitialSplitSize(DataSize.of(maxInitialSplitSizeKB, KILOBYTE)); - s3SelectTestHelper = new S3SelectTestHelper( - host, - port, - databaseName, - awsAccessKey, - awsSecretKey, - writableBucket, - testDirectory, - hiveConfig); - - int tableSplitsCount = s3SelectTestHelper.getTableSplitsCount(tableCsvWithSplits); - assertTrue(isSplitCountInOpenInterval(tableSplitsCount, minSplitCount, maxSplitCount)); - - ColumnHandle indexColumn = createBaseColumn("index", 0, HIVE_INT, BIGINT, REGULAR, Optional.empty()); - MaterializedResult filteredTableResult = s3SelectTestHelper.getFilteredTableResult(tableCsvWithSplits, indexColumn); - assertEqualsIgnoreOrder(filteredTableResult, - expectedResult(newSession(s3SelectTestHelper.getHiveConfig()), 1, 300)); - } - finally { - if (s3SelectTestHelper != null) { - s3SelectTestHelper.tearDown(); - } - } - } -} diff --git a/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/s3select/TestHiveFileSystemS3SelectJsonPushdown.java b/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/s3select/TestHiveFileSystemS3SelectJsonPushdown.java deleted file mode 100644 index 260d03608d2c6..0000000000000 --- a/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/s3select/TestHiveFileSystemS3SelectJsonPushdown.java +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package io.trino.plugin.hive.s3select; - -import com.google.common.collect.ImmutableList; -import io.trino.spi.connector.ColumnHandle; -import io.trino.spi.connector.SchemaTableName; -import io.trino.testing.MaterializedResult; -import org.testng.annotations.AfterClass; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Parameters; -import org.testng.annotations.Test; - -import java.util.List; -import java.util.Optional; - -import static io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; -import static io.trino.plugin.hive.HiveColumnHandle.createBaseColumn; -import static io.trino.plugin.hive.HiveFileSystemTestUtils.filterTable; -import static io.trino.plugin.hive.HiveFileSystemTestUtils.newSession; -import static io.trino.plugin.hive.HiveFileSystemTestUtils.readTable; -import static io.trino.plugin.hive.HiveType.HIVE_INT; -import static io.trino.spi.type.BigintType.BIGINT; -import static io.trino.testing.QueryAssertions.assertEqualsIgnoreOrder; - -public class TestHiveFileSystemS3SelectJsonPushdown -{ - private SchemaTableName tableJson; - - private S3SelectTestHelper s3SelectTestHelper; - - @Parameters({ - "hive.hadoop2.metastoreHost", - "hive.hadoop2.metastorePort", - "hive.hadoop2.databaseName", - "hive.hadoop2.s3.awsAccessKey", - "hive.hadoop2.s3.awsSecretKey", - "hive.hadoop2.s3.writableBucket", - "hive.hadoop2.s3.testDirectory", - }) - @BeforeClass - public void setup(String host, int port, String databaseName, String awsAccessKey, String awsSecretKey, String writableBucket, String testDirectory) - { - s3SelectTestHelper = new S3SelectTestHelper(host, port, databaseName, awsAccessKey, awsSecretKey, writableBucket, testDirectory); - tableJson = new SchemaTableName(databaseName, "trino_s3select_test_external_fs_json"); - } - - @Test - public void testGetRecordsJson() - throws Exception - { - assertEqualsIgnoreOrder( - readTable(tableJson, - s3SelectTestHelper.getTransactionManager(), - s3SelectTestHelper.getHiveConfig(), - s3SelectTestHelper.getPageSourceProvider(), - s3SelectTestHelper.getSplitManager()), - MaterializedResult.resultBuilder(newSession(s3SelectTestHelper.getHiveConfig()), BIGINT, BIGINT) - .row(2L, 4L).row(5L, 6L) // test_table.json - .row(7L, 23L).row(28L, 22L).row(13L, 10L) // test_table.json.gz - .row(1L, 19L).row(6L, 3L).row(24L, 22L).row(100L, 77L) // test_table.json.bz2 - .build()); - } - - @Test - public void testFilterRecordsJson() - throws Exception - { - List projectedColumns = ImmutableList.of( - createBaseColumn("col_1", 0, HIVE_INT, BIGINT, REGULAR, Optional.empty())); - - assertEqualsIgnoreOrder( - filterTable(tableJson, - projectedColumns, - s3SelectTestHelper.getTransactionManager(), - s3SelectTestHelper.getHiveConfig(), - s3SelectTestHelper.getPageSourceProvider(), - s3SelectTestHelper.getSplitManager()), - MaterializedResult.resultBuilder(newSession(s3SelectTestHelper.getHiveConfig()), BIGINT) - .row(2L).row(5L) // test_table.json - .row(7L).row(28L).row(13L) // test_table.json.gz - .row(1L).row(6L).row(24L).row(100L) // test_table.json.bz2 - .build()); - } - - @AfterClass(alwaysRun = true) - public void tearDown() - { - s3SelectTestHelper.tearDown(); - } -} diff --git a/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/s3select/TestHiveFileSystemS3SelectJsonPushdownWithSplits.java b/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/s3select/TestHiveFileSystemS3SelectJsonPushdownWithSplits.java deleted file mode 100644 index 1998ec9368daf..0000000000000 --- 
a/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/s3select/TestHiveFileSystemS3SelectJsonPushdownWithSplits.java +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.trino.plugin.hive.s3select; - -import io.airlift.units.DataSize; -import io.trino.plugin.hive.HiveConfig; -import io.trino.spi.connector.ColumnHandle; -import io.trino.spi.connector.SchemaTableName; -import io.trino.testing.MaterializedResult; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Parameters; -import org.testng.annotations.Test; - -import java.util.Optional; - -import static io.airlift.units.DataSize.Unit.KILOBYTE; -import static io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; -import static io.trino.plugin.hive.HiveColumnHandle.createBaseColumn; -import static io.trino.plugin.hive.HiveFileSystemTestUtils.newSession; -import static io.trino.plugin.hive.HiveType.HIVE_INT; -import static io.trino.plugin.hive.s3select.S3SelectTestHelper.expectedResult; -import static io.trino.plugin.hive.s3select.S3SelectTestHelper.isSplitCountInOpenInterval; -import static io.trino.spi.type.BigintType.BIGINT; -import static io.trino.testing.QueryAssertions.assertEqualsIgnoreOrder; -import static org.testng.Assert.assertTrue; - -public class TestHiveFileSystemS3SelectJsonPushdownWithSplits -{ - private String host; - private int port; - private String databaseName; - private String awsAccessKey; - private String awsSecretKey; - private String writableBucket; - private String testDirectory; - - private SchemaTableName tableJsonWithSplits; - - @Parameters({ - "hive.hadoop2.metastoreHost", - "hive.hadoop2.metastorePort", - "hive.hadoop2.databaseName", - "hive.hadoop2.s3.awsAccessKey", - "hive.hadoop2.s3.awsSecretKey", - "hive.hadoop2.s3.writableBucket", - "hive.hadoop2.s3.testDirectory", - }) - @BeforeClass - public void setup(String host, int port, String databaseName, String awsAccessKey, String awsSecretKey, String writableBucket, String testDirectory) - { - this.host = host; - this.port = port; - this.databaseName = databaseName; - this.awsAccessKey = awsAccessKey; - this.awsSecretKey = awsSecretKey; - this.writableBucket = writableBucket; - this.testDirectory = testDirectory; - - this.tableJsonWithSplits = new SchemaTableName(databaseName, "trino_s3select_test_json_scan_range_pushdown"); - } - - @DataProvider(name = "testSplitSize") - public static Object[][] splitSizeParametersProvider() - { - return new Object[][] {{15, 10, 6, 12}, {50, 30, 2, 4}}; - } - - @Test(dataProvider = "testSplitSize") - public void testQueryPushdownWithSplitSizeForJson(int maxSplitSizeKB, - int maxInitialSplitSizeKB, - int minSplitCount, - int maxSplitCount) - { - S3SelectTestHelper s3SelectTestHelper = null; - try { - HiveConfig hiveConfig = new HiveConfig() - .setS3SelectPushdownEnabled(true) - .setMaxSplitSize(DataSize.of(maxSplitSizeKB, KILOBYTE)) - .setMaxInitialSplitSize(DataSize.of(maxInitialSplitSizeKB, 
KILOBYTE)); - s3SelectTestHelper = new S3SelectTestHelper( - host, - port, - databaseName, - awsAccessKey, - awsSecretKey, - writableBucket, - testDirectory, - hiveConfig); - - int tableSplitsCount = s3SelectTestHelper.getTableSplitsCount(tableJsonWithSplits); - assertTrue(isSplitCountInOpenInterval(tableSplitsCount, minSplitCount, maxSplitCount)); - - ColumnHandle indexColumn = createBaseColumn("col_1", 0, HIVE_INT, BIGINT, REGULAR, Optional.empty()); - MaterializedResult filteredTableResult = s3SelectTestHelper.getFilteredTableResult(tableJsonWithSplits, indexColumn); - assertEqualsIgnoreOrder(filteredTableResult, - expectedResult(newSession(s3SelectTestHelper.getHiveConfig()), 1, 300)); - } - finally { - if (s3SelectTestHelper != null) { - s3SelectTestHelper.tearDown(); - } - } - } -} diff --git a/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/s3select/TestHiveFileSystemS3SelectPushdown.java b/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/s3select/TestHiveFileSystemS3SelectPushdown.java deleted file mode 100644 index eef3f86a1ad53..0000000000000 --- a/plugin/trino-hive-hadoop2/src/test/java/io/trino/plugin/hive/s3select/TestHiveFileSystemS3SelectPushdown.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package io.trino.plugin.hive.s3select; - -import com.google.common.collect.ImmutableList; -import io.trino.plugin.hive.AbstractTestHiveFileSystemS3; -import io.trino.spi.connector.ColumnHandle; -import io.trino.spi.connector.SchemaTableName; -import io.trino.testing.MaterializedResult; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Parameters; -import org.testng.annotations.Test; - -import java.util.List; -import java.util.Optional; - -import static io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; -import static io.trino.plugin.hive.HiveColumnHandle.createBaseColumn; -import static io.trino.plugin.hive.HiveType.HIVE_INT; -import static io.trino.spi.type.BigintType.BIGINT; -import static io.trino.testing.QueryAssertions.assertEqualsIgnoreOrder; - -public class TestHiveFileSystemS3SelectPushdown - extends AbstractTestHiveFileSystemS3 -{ - protected SchemaTableName tableWithPipeDelimiter; - protected SchemaTableName tableWithCommaDelimiter; - - @Parameters({ - "hive.hadoop2.metastoreHost", - "hive.hadoop2.metastorePort", - "hive.hadoop2.databaseName", - "hive.hadoop2.s3.endpoint", - "hive.hadoop2.s3.awsAccessKey", - "hive.hadoop2.s3.awsSecretKey", - "hive.hadoop2.s3.writableBucket", - "hive.hadoop2.s3.testDirectory", - }) - @BeforeClass - public void setup(String host, int port, String databaseName, String s3endpoint, String awsAccessKey, String awsSecretKey, String writableBucket, String testDirectory) - { - super.setup(host, port, databaseName, s3endpoint, awsAccessKey, awsSecretKey, writableBucket, testDirectory, true); - tableWithPipeDelimiter = new SchemaTableName(database, "trino_s3select_test_external_fs_with_pipe_delimiter"); - tableWithCommaDelimiter = new SchemaTableName(database, "trino_s3select_test_external_fs_with_comma_delimiter"); - } - - @Test - public void testGetRecordsWithPipeDelimiter() - throws Exception - { - assertEqualsIgnoreOrder( - readTable(tableWithPipeDelimiter), - MaterializedResult.resultBuilder(newSession(), BIGINT, BIGINT) - .row(1L, 2L).row(3L, 4L).row(55L, 66L) // test_table_with_pipe_delimiter.csv - .row(27L, 10L).row(8L, 2L).row(456L, 789L) // test_table_with_pipe_delimiter.csv.gzip - .row(22L, 11L).row(78L, 76L).row(1L, 2L).row(36L, 90L) // test_table_with_pipe_delimiter.csv.bz2 - .build()); - } - - @Test - public void testFilterRecordsWithPipeDelimiter() - throws Exception - { - List projectedColumns = ImmutableList.of( - createBaseColumn("t_bigint", 0, HIVE_INT, BIGINT, REGULAR, Optional.empty())); - - assertEqualsIgnoreOrder( - filterTable(tableWithPipeDelimiter, projectedColumns), - MaterializedResult.resultBuilder(newSession(), BIGINT) - .row(1L).row(3L).row(55L) // test_table_with_pipe_delimiter.csv - .row(27L).row(8L).row(456L) // test_table_with_pipe_delimiter.csv.gzip - .row(22L).row(78L).row(1L).row(36L) // test_table_with_pipe_delimiter.csv.bz2 - .build()); - } - - @Test - public void testGetRecordsWithCommaDelimiter() - throws Exception - { - assertEqualsIgnoreOrder( - readTable(tableWithCommaDelimiter), - MaterializedResult.resultBuilder(newSession(), BIGINT, BIGINT) - .row(7L, 1L).row(19L, 10L).row(1L, 345L) // test_table_with_comma_delimiter.csv - .row(27L, 10L).row(28L, 9L).row(90L, 94L) // test_table_with_comma_delimiter.csv.gzip - .row(11L, 24L).row(1L, 6L).row(21L, 12L).row(0L, 0L) // test_table_with_comma_delimiter.csv.bz2 - .build()); - } - - @Test - public void testFilterRecordsWithCommaDelimiter() - throws Exception - { - List projectedColumns = ImmutableList.of( - 
createBaseColumn("t_bigint", 0, HIVE_INT, BIGINT, REGULAR, Optional.empty())); - - assertEqualsIgnoreOrder( - filterTable(tableWithCommaDelimiter, projectedColumns), - MaterializedResult.resultBuilder(newSession(), BIGINT) - .row(7L).row(19L).row(1L) // test_table_with_comma_delimiter.csv - .row(27L).row(28L).row(90L) // test_table_with_comma_delimiter.csv.gzip - .row(11L).row(1L).row(21L).row(0L) // test_table_with_comma_delimiter.csv.bz2 - .build()); - } -} diff --git a/plugin/trino-hive/pom.xml b/plugin/trino-hive/pom.xml index 4e942f9710c90..644c749499e3b 100644 --- a/plugin/trino-hive/pom.xml +++ b/plugin/trino-hive/pom.xml @@ -36,11 +36,6 @@ aws-java-sdk-glue - - com.amazonaws - aws-java-sdk-s3 - - com.amazonaws aws-java-sdk-sts @@ -76,11 +71,6 @@ failsafe - - io.airlift - aircompressor - - io.airlift bootstrap @@ -196,11 +186,6 @@ hadoop-apache - - io.trino.hive - hive-apache - - io.trino.hive hive-thrift @@ -231,21 +216,6 @@ avro - - org.apache.avro - avro-mapred - - - org.apache.avro - avro-ipc - - - org.apache.avro - avro-ipc-jetty - - - - org.apache.parquet parquet-column @@ -311,6 +281,12 @@ provided + + com.amazonaws + aws-java-sdk-s3 + runtime + + io.airlift log-manager @@ -458,6 +434,12 @@ test + + io.trino.hive + hive-apache + test + + io.trino.tpch tpch @@ -551,7 +533,6 @@ **/TestHiveGlueMetastore.java **/TestHiveS3AndGlueMetastoreTest.java **/TestTrinoS3FileSystemAwsS3.java - **/TestS3SelectQueries.java **/TestFullParquetReader.java **/TestParquetReader.java **/Test*FailureRecoveryTest.java @@ -609,7 +590,6 @@ **/TestHiveGlueMetastore.java **/TestHiveS3AndGlueMetastoreTest.java **/TestTrinoS3FileSystemAwsS3.java - **/TestS3SelectQueries.java diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/BackgroundHiveSplitLoader.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/BackgroundHiveSplitLoader.java index a3f09bce745a0..f15040a8eb55c 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/BackgroundHiveSplitLoader.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/BackgroundHiveSplitLoader.java @@ -18,17 +18,17 @@ import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.ImmutableList; import com.google.common.collect.ListMultimap; +import com.google.common.collect.Multimaps; import com.google.common.collect.Streams; import com.google.common.io.CharStreams; import com.google.common.util.concurrent.Futures; import com.google.common.util.concurrent.ListenableFuture; import io.airlift.units.Duration; import io.trino.filesystem.FileEntry; +import io.trino.filesystem.FileIterator; import io.trino.filesystem.Location; import io.trino.filesystem.TrinoFileSystem; import io.trino.filesystem.TrinoFileSystemFactory; -import io.trino.hdfs.HdfsContext; -import io.trino.hdfs.HdfsEnvironment; import io.trino.hdfs.HdfsNamenodeStats; import io.trino.plugin.hive.HiveSplit.BucketConversion; import io.trino.plugin.hive.HiveSplit.BucketValidation; @@ -39,7 +39,6 @@ import io.trino.plugin.hive.metastore.Partition; import io.trino.plugin.hive.metastore.StorageFormat; import io.trino.plugin.hive.metastore.Table; -import io.trino.plugin.hive.s3select.S3SelectPushdown; import io.trino.plugin.hive.util.AcidTables.AcidState; import io.trino.plugin.hive.util.AcidTables.ParsedDelta; import io.trino.plugin.hive.util.HiveBucketing.BucketingVersion; @@ -54,24 +53,10 @@ import io.trino.spi.connector.DynamicFilter; import io.trino.spi.predicate.TupleDomain; import io.trino.spi.type.TypeManager; -import 
org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.mapred.FileSplit; -import org.apache.hadoop.mapred.InputFormat; -import org.apache.hadoop.mapred.InputSplit; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.JobConfigurable; -import org.apache.hadoop.mapred.TextInputFormat; -import org.apache.hadoop.mapreduce.MRConfig; -import org.apache.hadoop.util.StringUtils; - -import java.io.BufferedReader; + import java.io.IOException; import java.io.InputStreamReader; -import java.nio.charset.StandardCharsets; -import java.security.Principal; +import java.io.Reader; import java.util.ArrayList; import java.util.Arrays; import java.util.Deque; @@ -79,10 +64,10 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.Optional; import java.util.OptionalInt; import java.util.Properties; -import java.util.Set; import java.util.concurrent.ConcurrentLinkedDeque; import java.util.concurrent.Executor; import java.util.concurrent.atomic.AtomicInteger; @@ -98,13 +83,10 @@ import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkState; -import static com.google.common.collect.ImmutableSet.toImmutableSet; -import static com.google.common.collect.Iterables.getOnlyElement; import static com.google.common.util.concurrent.Futures.immediateVoidFuture; import static com.google.common.util.concurrent.MoreExecutors.directExecutor; import static io.airlift.concurrent.MoreFutures.addExceptionCallback; import static io.airlift.concurrent.MoreFutures.toListenableFuture; -import static io.trino.hdfs.ConfigurationUtils.toJobConf; import static io.trino.plugin.hive.HiveErrorCode.HIVE_BAD_DATA; import static io.trino.plugin.hive.HiveErrorCode.HIVE_EXCEEDED_PARTITION_LIMIT; import static io.trino.plugin.hive.HiveErrorCode.HIVE_FILESYSTEM_ERROR; @@ -129,7 +111,6 @@ import static io.trino.plugin.hive.util.AcidTables.isTransactionalTable; import static io.trino.plugin.hive.util.AcidTables.readAcidVersionFile; import static io.trino.plugin.hive.util.HiveClassNames.SYMLINK_TEXT_INPUT_FORMAT_CLASS; -import static io.trino.plugin.hive.util.HiveReaderUtil.getInputFormat; import static io.trino.plugin.hive.util.HiveUtil.checkCondition; import static io.trino.plugin.hive.util.HiveUtil.getDeserializerClassName; import static io.trino.plugin.hive.util.HiveUtil.getFooterCount; @@ -141,10 +122,10 @@ import static java.lang.Integer.parseInt; import static java.lang.Math.max; import static java.lang.String.format; +import static java.nio.charset.StandardCharsets.UTF_8; import static java.util.Collections.max; import static java.util.Objects.requireNonNull; import static java.util.concurrent.TimeUnit.MILLISECONDS; -import static org.apache.hadoop.fs.Path.getPathWithoutSchemeAndAuthority; public class BackgroundHiveSplitLoader implements HiveSplitLoader @@ -162,23 +143,18 @@ public class BackgroundHiveSplitLoader private static final ListenableFuture COMPLETED_FUTURE = immediateVoidFuture(); - private static final String FILE_INPUT_FORMAT_INPUT_DIR = "mapreduce.input.fileinputformat.inputdir"; - private final Table table; private final TupleDomain compactEffectivePredicate; private final DynamicFilter dynamicFilter; private final long dynamicFilteringWaitTimeoutMillis; private final TypeManager typeManager; private final Optional tableBucketInfo; - private final 
HdfsEnvironment hdfsEnvironment; - private final HdfsContext hdfsContext; private final HdfsNamenodeStats hdfsNamenodeStats; private final DirectoryLister directoryLister; private final TrinoFileSystemFactory fileSystemFactory; private final int loaderConcurrency; private final boolean recursiveDirWalkerEnabled; private final boolean ignoreAbsentPartitions; - private final boolean optimizeSymlinkListing; private final Executor executor; private final ConnectorSession session; private final ConcurrentLazyQueue partitions; @@ -220,14 +196,12 @@ public BackgroundHiveSplitLoader( Optional tableBucketInfo, ConnectorSession session, TrinoFileSystemFactory fileSystemFactory, - HdfsEnvironment hdfsEnvironment, HdfsNamenodeStats hdfsNamenodeStats, DirectoryLister directoryLister, Executor executor, int loaderConcurrency, boolean recursiveDirWalkerEnabled, boolean ignoreAbsentPartitions, - boolean optimizeSymlinkListing, Optional validWriteIds, Optional maxSplitFileSize, int maxPartitions) @@ -242,18 +216,15 @@ public BackgroundHiveSplitLoader( checkArgument(loaderConcurrency > 0, "loaderConcurrency must be > 0, found: %s", loaderConcurrency); this.session = session; this.fileSystemFactory = requireNonNull(fileSystemFactory, "fileSystemFactory is null"); - this.hdfsEnvironment = hdfsEnvironment; this.hdfsNamenodeStats = hdfsNamenodeStats; this.directoryLister = directoryLister; this.recursiveDirWalkerEnabled = recursiveDirWalkerEnabled; this.ignoreAbsentPartitions = ignoreAbsentPartitions; - this.optimizeSymlinkListing = optimizeSymlinkListing; requireNonNull(executor, "executor is null"); // direct executor is not supported in this implementation due to locking specifics checkExecutorIsNotDirectExecutor(executor); this.executor = executor; this.partitions = new ConcurrentLazyQueue<>(partitions); - this.hdfsContext = new HdfsContext(session); this.validWriteIds = requireNonNull(validWriteIds, "validWriteIds is null"); this.maxSplitFileSize = requireNonNull(maxSplitFileSize, "maxSplitFileSize is null"); this.maxPartitions = maxPartitions; @@ -434,61 +405,37 @@ private ListenableFuture loadPartition(HivePartitionMetadata partition) return COMPLETED_FUTURE; } - Path path = new Path(getPartitionLocation(table, partition.getPartition())); - Configuration configuration = hdfsEnvironment.getConfiguration(hdfsContext, path); - FileSystem fs = hdfsEnvironment.getFileSystem(hdfsContext, path); - - boolean s3SelectPushdownEnabled = S3SelectPushdown.shouldEnablePushdownForTable(session, table, path.toString(), partition.getPartition()); - - // S3 Select pushdown works at the granularity of individual S3 objects for compressed files - // and finer granularity for uncompressed files using scan range feature. 
- boolean shouldEnableSplits = S3SelectPushdown.isSplittable(s3SelectPushdownEnabled, schema, path.toString()); + Location location = Location.of(getPartitionLocation(table, partition.getPartition())); // Skip header / footer lines are not splittable except for a special case when skip.header.line.count=1 - boolean splittable = shouldEnableSplits && getFooterCount(schema) == 0 && getHeaderCount(schema) <= 1; + boolean splittable = getFooterCount(schema) == 0 && getHeaderCount(schema) <= 1; if (SYMLINK_TEXT_INPUT_FORMAT_CLASS.equals(getInputFormatName(schema).orElse(null))) { if (tableBucketInfo.isPresent()) { throw new TrinoException(NOT_SUPPORTED, "Bucketed table in SymlinkTextInputFormat is not yet supported"); } HiveStorageFormat targetStorageFormat = getSymlinkStorageFormat(getDeserializerClassName(schema)); - InputFormat targetInputFormat = getInputFormat(configuration, schema); - List targetPaths = hdfsEnvironment.doAs( - hdfsContext.getIdentity(), - () -> getTargetPathsFromSymlink(fs, path)); - Set parents = targetPaths.stream() - .map(Path::getParent) - .distinct() - .collect(toImmutableSet()); - if (optimizeSymlinkListing && parents.size() == 1 && !recursiveDirWalkerEnabled) { - Optional> manifestFileIterator = buildManifestFileIterator( - targetStorageFormat, - partitionName, - schema, - partitionKeys, - effectivePredicate, - partitionMatchSupplier, - s3SelectPushdownEnabled, - partition.getTableToPartitionMapping(), - getOnlyElement(parents), - targetPaths, - splittable); - if (manifestFileIterator.isPresent()) { - fileIterators.addLast(manifestFileIterator.get()); - return COMPLETED_FUTURE; - } - } - return createHiveSymlinkSplits( + ListMultimap targets = getTargetLocationsByParentFromSymlink(location); + + InternalHiveSplitFactory splitFactory = new InternalHiveSplitFactory( partitionName, targetStorageFormat, - targetInputFormat, schema, partitionKeys, effectivePredicate, partitionMatchSupplier, - s3SelectPushdownEnabled, partition.getTableToPartitionMapping(), - targetPaths); + Optional.empty(), + Optional.empty(), + getMaxInitialSplitSize(session), + isForceLocalScheduling(session), + maxSplitFileSize); + + for (Entry> entry : Multimaps.asMap(targets).entrySet()) { + fileIterators.addLast(buildManifestFileIterator(splitFactory, entry.getKey(), entry.getValue(), splittable)); + } + + return COMPLETED_FUTURE; } StorageFormat rawStorageFormat = partition.getPartition() @@ -522,7 +469,6 @@ private ListenableFuture loadPartition(HivePartitionMetadata partition) } InternalHiveSplitFactory splitFactory = new InternalHiveSplitFactory( - fs, partitionName, storageFormat, schema, @@ -534,23 +480,20 @@ private ListenableFuture loadPartition(HivePartitionMetadata partition) bucketValidation, getMaxInitialSplitSize(session), isForceLocalScheduling(session), - s3SelectPushdownEnabled, maxSplitFileSize); if (isTransactionalTable(table.getParameters())) { - return getTransactionalSplits(Location.of(path.toString()), splittable, bucketConversion, splitFactory); + return getTransactionalSplits(location, splittable, bucketConversion, splitFactory); } TrinoFileSystem trinoFileSystem = fileSystemFactory.create(session); - Location location = Location.of(path.toString()); // Bucketed partitions are fully loaded immediately since all files must be loaded to determine the file to bucket mapping if (tableBucketInfo.isPresent()) { List files = listBucketFiles(trinoFileSystem, location, splitFactory.getPartitionName()); return hiveSplitSource.addToQueue(getBucketedSplits(files, splitFactory, 
tableBucketInfo.get(), bucketConversion, splittable, Optional.empty())); } - Iterator splitIterator = createInternalHiveSplitIterator(trinoFileSystem, location, splitFactory, splittable, Optional.empty()); - fileIterators.addLast(splitIterator); + fileIterators.addLast(createInternalHiveSplitIterator(trinoFileSystem, location, splitFactory, splittable, Optional.empty())); return COMPLETED_FUTURE; } @@ -571,117 +514,23 @@ private List listBucketFiles(TrinoFileSystem fs, Location locat } } - private ListenableFuture createHiveSymlinkSplits( - String partitionName, - HiveStorageFormat storageFormat, - InputFormat targetInputFormat, - Properties schema, - List partitionKeys, - TupleDomain effectivePredicate, - BooleanSupplier partitionMatchSupplier, - boolean s3SelectPushdownEnabled, - TableToPartitionMapping tableToPartitionMapping, - List targetPaths) - throws IOException - { - ListenableFuture lastResult = COMPLETED_FUTURE; - for (Path targetPath : targetPaths) { - // the splits must be generated using the file system for the target path - // get the configuration for the target path -- it may be a different hdfs instance - FileSystem targetFilesystem = hdfsEnvironment.getFileSystem(hdfsContext, targetPath); - JobConf targetJob = toJobConf(targetFilesystem.getConf()); - targetJob.setInputFormat(TextInputFormat.class); - Optional principal = hdfsContext.getIdentity().getPrincipal(); - if (principal.isPresent()) { - targetJob.set(MRConfig.FRAMEWORK_NAME, MRConfig.CLASSIC_FRAMEWORK_NAME); - targetJob.set(MRConfig.MASTER_USER_NAME, principal.get().getName()); - } - if (targetInputFormat instanceof JobConfigurable) { - ((JobConfigurable) targetInputFormat).configure(targetJob); - } - targetJob.set(FILE_INPUT_FORMAT_INPUT_DIR, StringUtils.escapeString(targetPath.toString())); - InputSplit[] targetSplits = hdfsEnvironment.doAs( - hdfsContext.getIdentity(), - () -> targetInputFormat.getSplits(targetJob, 0)); - - InternalHiveSplitFactory splitFactory = new InternalHiveSplitFactory( - targetFilesystem, - partitionName, - storageFormat, - schema, - partitionKeys, - effectivePredicate, - partitionMatchSupplier, - tableToPartitionMapping, - Optional.empty(), - Optional.empty(), - getMaxInitialSplitSize(session), - isForceLocalScheduling(session), - s3SelectPushdownEnabled, - maxSplitFileSize); - lastResult = addSplitsToSource(targetSplits, splitFactory); - if (stopped) { - return COMPLETED_FUTURE; - } - } - return lastResult; - } - @VisibleForTesting - Optional> buildManifestFileIterator( - HiveStorageFormat targetStorageFormat, - String partitionName, - Properties schema, - List partitionKeys, - TupleDomain effectivePredicate, - BooleanSupplier partitionMatchSupplier, - boolean s3SelectPushdownEnabled, - TableToPartitionMapping tableToPartitionMapping, - Path parent, - List paths, - boolean splittable) - throws IOException + Iterator buildManifestFileIterator(InternalHiveSplitFactory splitFactory, Location location, List paths, boolean splittable) { - FileSystem targetFilesystem = hdfsEnvironment.getFileSystem(hdfsContext, parent); TrinoFileSystem trinoFileSystem = fileSystemFactory.create(session); - Location location = Location.of(parent.toString()); - Map fileStatuses = new HashMap<>(); - HiveFileIterator fileStatusIterator = new HiveFileIterator(table, location, trinoFileSystem, directoryLister, hdfsNamenodeStats, IGNORED); + Map fileStatuses = new HashMap<>(); + Iterator fileStatusIterator = new HiveFileIterator(table, location, trinoFileSystem, directoryLister, hdfsNamenodeStats, RECURSE); if 
(!fileStatusIterator.hasNext()) { checkPartitionLocationExists(trinoFileSystem, location); } - fileStatusIterator.forEachRemaining(status -> fileStatuses.put(getPathWithoutSchemeAndAuthority(new Path(status.getPath())), status)); - - List locatedFileStatuses = new ArrayList<>(); - for (Path path : paths) { - TrinoFileStatus status = fileStatuses.get(getPathWithoutSchemeAndAuthority(path)); - // This check will catch all directories in the manifest since HiveFileIterator will not return any directories. - // Some files may not be listed by HiveFileIterator - if those are included in the manifest this check will fail as well. - if (status == null) { - return Optional.empty(); - } + fileStatusIterator.forEachRemaining(status -> fileStatuses.put(Location.of(status.getPath()).path(), status)); - locatedFileStatuses.add(status); - } + List locatedFileStatuses = paths.stream() + .map(path -> fileStatuses.get(path.path())) + .toList(); - InternalHiveSplitFactory splitFactory = new InternalHiveSplitFactory( - targetFilesystem, - partitionName, - targetStorageFormat, - schema, - partitionKeys, - effectivePredicate, - partitionMatchSupplier, - tableToPartitionMapping, - Optional.empty(), - Optional.empty(), - getMaxInitialSplitSize(session), - isForceLocalScheduling(session), - s3SelectPushdownEnabled, - maxSplitFileSize); - - return Optional.of(createInternalHiveSplitIterator(splitFactory, splittable, Optional.empty(), locatedFileStatuses.stream())); + return createInternalHiveSplitIterator(splitFactory, splittable, Optional.empty(), locatedFileStatuses.stream()); } private ListenableFuture getTransactionalSplits(Location path, boolean splittable, Optional bucketConversion, InternalHiveSplitFactory splitFactory) @@ -785,22 +634,6 @@ private static Optional acidInfoForOriginalFiles(boolean fullAcid, Aci return fullAcid ? Optional.of(builder.buildWithRequiredOriginalFiles(getRequiredBucketNumber(location))) : Optional.empty(); } - private ListenableFuture addSplitsToSource(InputSplit[] targetSplits, InternalHiveSplitFactory splitFactory) - throws IOException - { - ListenableFuture lastResult = COMPLETED_FUTURE; - for (InputSplit inputSplit : targetSplits) { - Optional internalHiveSplit = splitFactory.createInternalHiveSplit((FileSplit) inputSplit); - if (internalHiveSplit.isPresent()) { - lastResult = hiveSplitSource.addToQueue(internalHiveSplit.get()); - } - if (stopped) { - return COMPLETED_FUTURE; - } - } - return lastResult; - } - private Iterator createInternalHiveSplitIterator(TrinoFileSystem fileSystem, Location location, InternalHiveSplitFactory splitFactory, boolean splittable, Optional acidInfo) { Iterator iterator = new HiveFileIterator(table, location, fileSystem, directoryLister, hdfsNamenodeStats, recursiveDirWalkerEnabled ? 
RECURSE : IGNORED); @@ -986,23 +819,28 @@ private static HiveStorageFormat getSymlinkStorageFormat(String serde) .orElseThrow(() -> new TrinoException(HIVE_UNSUPPORTED_FORMAT, "Unknown SerDe for SymlinkTextInputFormat: " + serde)); } - private static List getTargetPathsFromSymlink(FileSystem fileSystem, Path symlinkDir) + private ListMultimap getTargetLocationsByParentFromSymlink(Location symlinkDir) { + TrinoFileSystem fileSystem = fileSystemFactory.create(session); try { - FileStatus[] symlinks = fileSystem.listStatus(symlinkDir, path -> - !path.getName().startsWith("_") && !path.getName().startsWith(".")); - List targets = new ArrayList<>(); + ListMultimap targets = ArrayListMultimap.create(); + FileIterator iterator = fileSystem.listFiles(symlinkDir); + while (iterator.hasNext()) { + Location location = iterator.next().location(); + String name = location.fileName(); + if (name.startsWith("_") || name.startsWith(".")) { + continue; + } - for (FileStatus symlink : symlinks) { - try (BufferedReader reader = new BufferedReader(new InputStreamReader(fileSystem.open(symlink.getPath()), StandardCharsets.UTF_8))) { + try (Reader reader = new InputStreamReader(fileSystem.newInputFile(location).newStream(), UTF_8)) { CharStreams.readLines(reader).stream() - .map(Path::new) - .forEach(targets::add); + .map(Location::of) + .forEach(target -> targets.put(target.parentDirectory(), target)); } } return targets; } - catch (IOException e) { + catch (IOException | IllegalArgumentException e) { throw new TrinoException(HIVE_BAD_DATA, "Error parsing symlinks from: " + symlinkDir, e); } } diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/GenericHiveRecordCursor.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/GenericHiveRecordCursor.java deleted file mode 100644 index e83297ba9d65a..0000000000000 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/GenericHiveRecordCursor.java +++ /dev/null @@ -1,605 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
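Note on the new symlink handling above: the patch replaces the Hadoop FileSystem/InputFormat based symlink expansion with a TrinoFileSystem listing that groups every target file by its parent directory. The generic type parameters do not survive in this rendering of the diff, so the following is a hedged reconstruction of that grouping step; the wrapper class and method shape are assumptions for illustration, while the individual calls (listFiles, newInputFile, CharStreams.readLines, Location.parentDirectory) are taken from the patch itself.

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ListMultimap;
import com.google.common.io.CharStreams;
import io.trino.filesystem.FileIterator;
import io.trino.filesystem.Location;
import io.trino.filesystem.TrinoFileSystem;

import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;

import static java.nio.charset.StandardCharsets.UTF_8;

final class SymlinkTargetListing
{
    private SymlinkTargetListing() {}

    // Read every manifest under the symlink directory and group the referenced data
    // files by parent directory, mirroring getTargetLocationsByParentFromSymlink above
    // (generic types restored by assumption; error handling simplified to IOException).
    static ListMultimap<Location, Location> targetsByParent(TrinoFileSystem fileSystem, Location symlinkDir)
            throws IOException
    {
        ListMultimap<Location, Location> targets = ArrayListMultimap.create();
        FileIterator iterator = fileSystem.listFiles(symlinkDir);
        while (iterator.hasNext()) {
            Location manifest = iterator.next().location();
            String name = manifest.fileName();
            if (name.startsWith("_") || name.startsWith(".")) {
                continue; // hidden and metadata entries are not symlink manifests
            }
            try (Reader reader = new InputStreamReader(fileSystem.newInputFile(manifest).newStream(), UTF_8)) {
                CharStreams.readLines(reader).stream()
                        .map(Location::of)
                        .forEach(target -> targets.put(target.parentDirectory(), target));
            }
        }
        return targets;
    }
}

Each parent directory then yields one manifest-driven split iterator: the rewritten loadPartition walks Multimaps.asMap(targets).entrySet() and enqueues buildManifestFileIterator(splitFactory, parent, files, splittable) per entry, instead of running InputFormat.getSplits per target path as the deleted createHiveSymlinkSplits did.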
- */ -package io.trino.plugin.hive; - -import io.airlift.slice.Slice; -import io.airlift.slice.Slices; -import io.trino.hadoop.TextLineLengthLimitExceededException; -import io.trino.plugin.base.type.DecodedTimestamp; -import io.trino.plugin.base.type.TrinoTimestampEncoder; -import io.trino.spi.TrinoException; -import io.trino.spi.block.Block; -import io.trino.spi.connector.RecordCursor; -import io.trino.spi.type.CharType; -import io.trino.spi.type.DecimalType; -import io.trino.spi.type.Int128; -import io.trino.spi.type.LongTimestamp; -import io.trino.spi.type.TimestampType; -import io.trino.spi.type.Type; -import io.trino.spi.type.VarcharType; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.common.type.Date; -import org.apache.hadoop.hive.common.type.HiveChar; -import org.apache.hadoop.hive.common.type.HiveDecimal; -import org.apache.hadoop.hive.common.type.HiveVarchar; -import org.apache.hadoop.hive.common.type.Timestamp; -import org.apache.hadoop.hive.serde2.Deserializer; -import org.apache.hadoop.hive.serde2.SerDeException; -import org.apache.hadoop.hive.serde2.io.HiveCharWritable; -import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructField; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.hadoop.io.BinaryComparable; -import org.apache.hadoop.io.BytesWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapred.RecordReader; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.math.BigInteger; -import java.util.Arrays; -import java.util.List; -import java.util.Properties; - -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkState; -import static io.trino.plugin.base.type.TrinoTimestampEncoderFactory.createTimestampEncoder; -import static io.trino.plugin.base.util.Closables.closeAllSuppress; -import static io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; -import static io.trino.plugin.hive.HiveErrorCode.HIVE_BAD_DATA; -import static io.trino.plugin.hive.HiveErrorCode.HIVE_CURSOR_ERROR; -import static io.trino.plugin.hive.util.HiveReaderUtil.getDeserializer; -import static io.trino.plugin.hive.util.HiveReaderUtil.getTableObjectInspector; -import static io.trino.plugin.hive.util.HiveUtil.isStructuralType; -import static io.trino.plugin.hive.util.SerDeUtils.getBlockObject; -import static io.trino.spi.type.BigintType.BIGINT; -import static io.trino.spi.type.BooleanType.BOOLEAN; -import static io.trino.spi.type.Chars.truncateToLengthAndTrimSpaces; -import static io.trino.spi.type.DateType.DATE; -import static io.trino.spi.type.Decimals.rescale; -import static io.trino.spi.type.DoubleType.DOUBLE; -import static io.trino.spi.type.IntegerType.INTEGER; -import static io.trino.spi.type.RealType.REAL; -import static io.trino.spi.type.SmallintType.SMALLINT; -import static io.trino.spi.type.TinyintType.TINYINT; -import static io.trino.spi.type.VarbinaryType.VARBINARY; -import static io.trino.spi.type.Varchars.truncateToLength; -import static java.lang.Float.floatToRawIntBits; -import static java.lang.Math.max; -import static java.lang.Math.min; -import static java.lang.String.format; -import static 
java.util.Objects.requireNonNull; -import static org.joda.time.DateTimeZone.UTC; - -public class GenericHiveRecordCursor - implements RecordCursor -{ - private final Path path; - private final RecordReader recordReader; - private final K key; - private final V value; - - private final Deserializer deserializer; - - private final Type[] types; - private final HiveType[] hiveTypes; - - private final StructObjectInspector rowInspector; - private final ObjectInspector[] fieldInspectors; - private final StructField[] structFields; - - private final boolean[] loaded; - private final boolean[] booleans; - private final long[] longs; - private final double[] doubles; - private final Slice[] slices; - private final Object[] objects; - private final boolean[] nulls; - private final TrinoTimestampEncoder[] timestampEncoders; - - private final long totalBytes; - - private long completedBytes; - private Object rowData; - private boolean closed; - - public GenericHiveRecordCursor( - Configuration configuration, - Path path, - RecordReader recordReader, - long totalBytes, - Properties splitSchema, - List columns) - { - requireNonNull(path, "path is null"); - requireNonNull(recordReader, "recordReader is null"); - checkArgument(totalBytes >= 0, "totalBytes is negative"); - requireNonNull(splitSchema, "splitSchema is null"); - requireNonNull(columns, "columns is null"); - - this.path = path; - this.recordReader = recordReader; - this.totalBytes = totalBytes; - this.key = recordReader.createKey(); - this.value = recordReader.createValue(); - - this.deserializer = getDeserializer(configuration, splitSchema); - this.rowInspector = getTableObjectInspector(deserializer); - - int size = columns.size(); - - this.types = new Type[size]; - this.hiveTypes = new HiveType[size]; - - this.structFields = new StructField[size]; - this.fieldInspectors = new ObjectInspector[size]; - - this.loaded = new boolean[size]; - this.booleans = new boolean[size]; - this.longs = new long[size]; - this.doubles = new double[size]; - this.slices = new Slice[size]; - this.objects = new Object[size]; - this.nulls = new boolean[size]; - this.timestampEncoders = new TrinoTimestampEncoder[size]; - - // initialize data columns - for (int i = 0; i < columns.size(); i++) { - HiveColumnHandle column = columns.get(i); - checkState(column.getColumnType() == REGULAR, "column type must be regular"); - - Type columnType = column.getType(); - types[i] = columnType; - if (columnType instanceof TimestampType) { - timestampEncoders[i] = createTimestampEncoder((TimestampType) columnType, UTC); - } - hiveTypes[i] = column.getHiveType(); - - StructField field = rowInspector.getStructFieldRef(column.getName()); - structFields[i] = field; - fieldInspectors[i] = field.getFieldObjectInspector(); - } - } - - @Override - public long getCompletedBytes() - { - if (!closed) { - updateCompletedBytes(); - } - return completedBytes; - } - - @Override - public long getReadTimeNanos() - { - return 0; - } - - private void updateCompletedBytes() - { - try { - @SuppressWarnings("NumericCastThatLosesPrecision") - long newCompletedBytes = (long) (totalBytes * recordReader.getProgress()); - completedBytes = min(totalBytes, max(completedBytes, newCompletedBytes)); - } - catch (IOException ignored) { - } - } - - @Override - public Type getType(int field) - { - return types[field]; - } - - @Override - public boolean advanceNextPosition() - { - try { - if (closed || !recordReader.next(key, value)) { - close(); - return false; - } - - // Only deserialize the value if atleast one 
column is required - if (types.length > 0) { - // reset loaded flags - Arrays.fill(loaded, false); - - // decode value - rowData = deserializer.deserialize(value); - } - - return true; - } - catch (IOException | SerDeException | RuntimeException e) { - closeAllSuppress(e, this); - if (e instanceof TextLineLengthLimitExceededException) { - throw new TrinoException(HIVE_BAD_DATA, "Line too long in text file: " + path, e); - } - throw new TrinoException(HIVE_CURSOR_ERROR, e); - } - } - - @Override - public boolean getBoolean(int fieldId) - { - checkState(!closed, "Cursor is closed"); - - validateType(fieldId, boolean.class); - if (!loaded[fieldId]) { - parseBooleanColumn(fieldId); - } - return booleans[fieldId]; - } - - private void parseBooleanColumn(int column) - { - loaded[column] = true; - - Object fieldData = rowInspector.getStructFieldData(rowData, structFields[column]); - - if (fieldData == null) { - nulls[column] = true; - } - else { - Object fieldValue = ((PrimitiveObjectInspector) fieldInspectors[column]).getPrimitiveJavaObject(fieldData); - checkState(fieldValue != null, "fieldValue should not be null"); - booleans[column] = (Boolean) fieldValue; - nulls[column] = false; - } - } - - @Override - public long getLong(int fieldId) - { - checkState(!closed, "Cursor is closed"); - - validateType(fieldId, long.class); - if (!loaded[fieldId]) { - parseLongColumn(fieldId); - } - return longs[fieldId]; - } - - private void parseLongColumn(int column) - { - loaded[column] = true; - - Object fieldData = rowInspector.getStructFieldData(rowData, structFields[column]); - - if (fieldData == null) { - nulls[column] = true; - } - else { - Object fieldValue = ((PrimitiveObjectInspector) fieldInspectors[column]).getPrimitiveJavaObject(fieldData); - checkState(fieldValue != null, "fieldValue should not be null"); - longs[column] = getLongExpressedValue(fieldValue, column); - nulls[column] = false; - } - } - - private long getLongExpressedValue(Object value, int column) - { - if (value instanceof Date) { - return ((Date) value).toEpochDay(); - } - if (value instanceof Timestamp) { - return shortTimestamp((Timestamp) value, column); - } - if (value instanceof Float) { - return floatToRawIntBits(((Float) value)); - } - return ((Number) value).longValue(); - } - - @Override - public double getDouble(int fieldId) - { - checkState(!closed, "Cursor is closed"); - - validateType(fieldId, double.class); - if (!loaded[fieldId]) { - parseDoubleColumn(fieldId); - } - return doubles[fieldId]; - } - - private void parseDoubleColumn(int column) - { - loaded[column] = true; - - Object fieldData = rowInspector.getStructFieldData(rowData, structFields[column]); - - if (fieldData == null) { - nulls[column] = true; - } - else { - Object fieldValue = ((PrimitiveObjectInspector) fieldInspectors[column]).getPrimitiveJavaObject(fieldData); - checkState(fieldValue != null, "fieldValue should not be null"); - doubles[column] = ((Number) fieldValue).doubleValue(); - nulls[column] = false; - } - } - - @Override - public Slice getSlice(int fieldId) - { - checkState(!closed, "Cursor is closed"); - - validateType(fieldId, Slice.class); - if (!loaded[fieldId]) { - parseStringColumn(fieldId); - } - return slices[fieldId]; - } - - private void parseStringColumn(int column) - { - loaded[column] = true; - - Object fieldData = rowInspector.getStructFieldData(rowData, structFields[column]); - - if (fieldData == null) { - nulls[column] = true; - } - else { - PrimitiveObjectInspector inspector = (PrimitiveObjectInspector) 
fieldInspectors[column]; - Slice value; - if (inspector.preferWritable()) { - value = parseStringFromPrimitiveWritableObjectValue(types[column], inspector.getPrimitiveWritableObject(fieldData)); - } - else { - value = parseStringFromPrimitiveJavaObjectValue(types[column], inspector.getPrimitiveJavaObject(fieldData)); - } - slices[column] = value; - nulls[column] = false; - } - } - - private static Slice trimStringToCharacterLimits(Type type, Slice value) - { - if (type instanceof VarcharType) { - return truncateToLength(value, type); - } - if (type instanceof CharType) { - return truncateToLengthAndTrimSpaces(value, type); - } - return value; - } - - private static Slice parseStringFromPrimitiveWritableObjectValue(Type type, Object fieldValue) - { - checkState(fieldValue != null, "fieldValue should not be null"); - BinaryComparable hiveValue; - if (fieldValue instanceof Text) { - hiveValue = (Text) fieldValue; - } - else if (fieldValue instanceof BytesWritable) { - hiveValue = (BytesWritable) fieldValue; - } - else if (fieldValue instanceof HiveVarcharWritable) { - hiveValue = ((HiveVarcharWritable) fieldValue).getTextValue(); - } - else if (fieldValue instanceof HiveCharWritable) { - hiveValue = ((HiveCharWritable) fieldValue).getTextValue(); - } - else { - throw new IllegalStateException("unsupported string field type: " + fieldValue.getClass().getName()); - } - // create a slice view over the hive value and trim to character limits - Slice value = trimStringToCharacterLimits(type, Slices.wrappedBuffer(hiveValue.getBytes(), 0, hiveValue.getLength())); - // store a copy of the bytes, since the hive reader can reuse the underlying buffer - return value.copy(); - } - - private static Slice parseStringFromPrimitiveJavaObjectValue(Type type, Object fieldValue) - { - checkState(fieldValue != null, "fieldValue should not be null"); - Slice value; - if (fieldValue instanceof String) { - value = Slices.utf8Slice((String) fieldValue); - } - else if (fieldValue instanceof byte[]) { - value = Slices.wrappedBuffer((byte[]) fieldValue); - } - else if (fieldValue instanceof HiveVarchar) { - value = Slices.utf8Slice(((HiveVarchar) fieldValue).getValue()); - } - else if (fieldValue instanceof HiveChar) { - value = Slices.utf8Slice(((HiveChar) fieldValue).getValue()); - } - else { - throw new IllegalStateException("unsupported string field type: " + fieldValue.getClass().getName()); - } - value = trimStringToCharacterLimits(type, value); - // Copy the slice if the value was trimmed and is now smaller than the backing buffer - if (!value.isCompact()) { - return value.copy(); - } - return value; - } - - private void parseDecimalColumn(int column) - { - loaded[column] = true; - - Object fieldData = rowInspector.getStructFieldData(rowData, structFields[column]); - - if (fieldData == null) { - nulls[column] = true; - } - else { - Object fieldValue = ((PrimitiveObjectInspector) fieldInspectors[column]).getPrimitiveJavaObject(fieldData); - checkState(fieldValue != null, "fieldValue should not be null"); - - HiveDecimal decimal = (HiveDecimal) fieldValue; - DecimalType columnType = (DecimalType) types[column]; - BigInteger unscaledDecimal = rescale(decimal.unscaledValue(), decimal.scale(), columnType.getScale()); - - if (columnType.isShort()) { - longs[column] = unscaledDecimal.longValue(); - } - else { - objects[column] = Int128.valueOf(unscaledDecimal); - } - nulls[column] = false; - } - } - - @Override - public Object getObject(int fieldId) - { - checkState(!closed, "Cursor is closed"); - - if 
(!loaded[fieldId]) { - parseObjectColumn(fieldId); - } - return objects[fieldId]; - } - - private void parseObjectColumn(int column) - { - loaded[column] = true; - - Object fieldData = rowInspector.getStructFieldData(rowData, structFields[column]); - - if (fieldData == null) { - nulls[column] = true; - } - else { - Type type = types[column]; - if (type.getJavaType() == Block.class) { - objects[column] = getBlockObject(type, fieldData, fieldInspectors[column]); - } - else if (type instanceof TimestampType) { - Timestamp timestamp = (Timestamp) ((PrimitiveObjectInspector) fieldInspectors[column]).getPrimitiveJavaObject(fieldData); - objects[column] = longTimestamp(timestamp, column); - } - else { - throw new IllegalStateException("Unsupported type: " + type); - } - nulls[column] = false; - } - } - - @Override - public boolean isNull(int fieldId) - { - checkState(!closed, "Cursor is closed"); - - if (!loaded[fieldId]) { - parseColumn(fieldId); - } - return nulls[fieldId]; - } - - private void parseColumn(int column) - { - Type type = types[column]; - if (BOOLEAN.equals(type)) { - parseBooleanColumn(column); - } - else if (BIGINT.equals(type)) { - parseLongColumn(column); - } - else if (INTEGER.equals(type)) { - parseLongColumn(column); - } - else if (SMALLINT.equals(type)) { - parseLongColumn(column); - } - else if (TINYINT.equals(type)) { - parseLongColumn(column); - } - else if (REAL.equals(type)) { - parseLongColumn(column); - } - else if (DOUBLE.equals(type)) { - parseDoubleColumn(column); - } - else if (type instanceof VarcharType || VARBINARY.equals(type)) { - parseStringColumn(column); - } - else if (type instanceof CharType) { - parseStringColumn(column); - } - else if (isStructuralType(type)) { - parseObjectColumn(column); - } - else if (DATE.equals(type)) { - parseLongColumn(column); - } - else if (type instanceof TimestampType) { - if (((TimestampType) type).isShort()) { - parseLongColumn(column); - } - else { - parseObjectColumn(column); - } - } - else if (type instanceof DecimalType) { - parseDecimalColumn(column); - } - else { - throw new UnsupportedOperationException("Unsupported column type: " + type); - } - } - - private void validateType(int fieldId, Class type) - { - if (!types[fieldId].getJavaType().equals(type)) { - // we don't use Preconditions.checkArgument because it requires boxing fieldId, which affects inner loop performance - throw new IllegalArgumentException(format("Expected field to be %s, actual %s (field %s)", type, types[fieldId], fieldId)); - } - } - - @Override - public void close() - { - // some hive input formats are broken and bad things can happen if you close them multiple times - if (closed) { - return; - } - closed = true; - - updateCompletedBytes(); - - try { - recordReader.close(); - } - catch (IOException e) { - throw new UncheckedIOException(e); - } - } - - private long shortTimestamp(Timestamp value, int column) - { - @SuppressWarnings("unchecked") - TrinoTimestampEncoder encoder = (TrinoTimestampEncoder) timestampEncoders[column]; - return encoder.getTimestamp(new DecodedTimestamp(value.toEpochSecond(), value.getNanos())); - } - - private LongTimestamp longTimestamp(Timestamp value, int column) - { - @SuppressWarnings("unchecked") - TrinoTimestampEncoder encoder = (TrinoTimestampEncoder) timestampEncoders[column]; - return encoder.getTimestamp(new DecodedTimestamp(value.toEpochSecond(), value.getNanos())); - } -} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/GenericHiveRecordCursorProvider.java 
b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/GenericHiveRecordCursorProvider.java deleted file mode 100644 index e809bb83278a7..0000000000000 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/GenericHiveRecordCursorProvider.java +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.trino.plugin.hive; - -import com.google.inject.Inject; -import io.airlift.units.DataSize; -import io.trino.filesystem.Location; -import io.trino.hdfs.HdfsEnvironment; -import io.trino.spi.TrinoException; -import io.trino.spi.connector.ConnectorSession; -import io.trino.spi.connector.RecordCursor; -import io.trino.spi.predicate.TupleDomain; -import io.trino.spi.type.TypeManager; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapred.RecordReader; -import org.apache.hadoop.mapreduce.lib.input.LineRecordReader; - -import java.io.IOException; -import java.util.List; -import java.util.Optional; -import java.util.Properties; - -import static com.google.common.base.Preconditions.checkArgument; -import static io.trino.plugin.hive.HiveErrorCode.HIVE_FILESYSTEM_ERROR; -import static io.trino.plugin.hive.HivePageSourceProvider.projectBaseColumns; -import static io.trino.plugin.hive.util.HiveReaderUtil.createRecordReader; -import static java.lang.Math.toIntExact; -import static java.util.Objects.requireNonNull; -import static java.util.stream.Collectors.toUnmodifiableList; - -public class GenericHiveRecordCursorProvider - implements HiveRecordCursorProvider -{ - private final HdfsEnvironment hdfsEnvironment; - private final int textMaxLineLengthBytes; - - @Inject - public GenericHiveRecordCursorProvider(HdfsEnvironment hdfsEnvironment, HiveConfig config) - { - this(hdfsEnvironment, config.getTextMaxLineLength()); - } - - public GenericHiveRecordCursorProvider(HdfsEnvironment hdfsEnvironment, DataSize textMaxLineLength) - { - this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); - this.textMaxLineLengthBytes = toIntExact(textMaxLineLength.toBytes()); - checkArgument(textMaxLineLengthBytes >= 1, "textMaxLineLength must be at least 1 byte"); - } - - @Override - public Optional createRecordCursor( - Configuration configuration, - ConnectorSession session, - Location location, - long start, - long length, - long fileSize, - Properties schema, - List columns, - TupleDomain effectivePredicate, - TypeManager typeManager, - boolean s3SelectPushdownEnabled) - { - configuration.setInt(LineRecordReader.MAX_LINE_LENGTH, textMaxLineLengthBytes); - - // make sure the FileSystem is created with the proper Configuration object - Path path = new Path(location.toString()); - try { - this.hdfsEnvironment.getFileSystem(session.getIdentity(), path, configuration); - } - catch (IOException e) { - throw new TrinoException(HIVE_FILESYSTEM_ERROR, "Failed getting FileSystem: " + path, e); - } - - Optional projections = projectBaseColumns(columns); - List 
readerColumns = projections - .map(ReaderColumns::get) - .map(columnHandles -> columnHandles.stream() - .map(HiveColumnHandle.class::cast) - .collect(toUnmodifiableList())) - .orElse(columns); - - RecordCursor cursor = hdfsEnvironment.doAs(session.getIdentity(), () -> { - RecordReader recordReader = createRecordReader( - configuration, - path, - start, - length, - schema, - readerColumns); - - try { - return new GenericHiveRecordCursor<>( - configuration, - path, - genericRecordReader(recordReader), - length, - schema, - readerColumns); - } - catch (Exception e) { - try { - recordReader.close(); - } - catch (IOException closeException) { - if (e != closeException) { - e.addSuppressed(closeException); - } - } - throw e; - } - }); - - return Optional.of(new ReaderRecordCursorWithProjections(cursor, projections)); - } - - @SuppressWarnings("unchecked") - private static RecordReader genericRecordReader(RecordReader recordReader) - { - return (RecordReader) recordReader; - } -} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveBucketAdapterRecordCursor.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveBucketAdapterRecordCursor.java deleted file mode 100644 index c86ba7d82c4a5..0000000000000 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveBucketAdapterRecordCursor.java +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
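One detail of the deleted GenericHiveRecordCursorProvider worth keeping in mind when reading the replacement readers: if wrapping the freshly created Hadoop RecordReader in a cursor failed, the reader was closed and any secondary close failure was attached as a suppressed exception before rethrowing. A condensed, hedged sketch of that cleanup pattern follows; the wrapOrClose/Wrapper names are hypothetical stand-ins, not part of the removed code.

import java.io.Closeable;
import java.io.IOException;

final class CloseOnFailure
{
    private CloseOnFailure() {}

    // Wrap a resource; if wrapping fails, close it and preserve the close error as suppressed.
    static <R extends Closeable, W> W wrapOrClose(R resource, Wrapper<R, W> wrapper)
            throws IOException
    {
        try {
            return wrapper.wrap(resource);
        }
        catch (Exception e) {
            try {
                resource.close();
            }
            catch (IOException closeException) {
                if (e != closeException) {
                    e.addSuppressed(closeException);
                }
            }
            throw e;
        }
    }

    interface Wrapper<R, W>
    {
        W wrap(R resource) throws IOException;
    }
}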
- */ -package io.trino.plugin.hive; - -import io.airlift.slice.Slice; -import io.trino.plugin.hive.type.TypeInfo; -import io.trino.plugin.hive.util.ForwardingRecordCursor; -import io.trino.plugin.hive.util.HiveBucketing; -import io.trino.plugin.hive.util.HiveBucketing.BucketingVersion; -import io.trino.spi.TrinoException; -import io.trino.spi.block.Block; -import io.trino.spi.connector.RecordCursor; -import io.trino.spi.type.Type; -import io.trino.spi.type.TypeManager; - -import java.util.List; - -import static com.google.common.collect.ImmutableList.toImmutableList; -import static io.trino.plugin.hive.HiveErrorCode.HIVE_INVALID_BUCKET_FILES; -import static io.trino.spi.StandardErrorCode.GENERIC_INTERNAL_ERROR; -import static java.lang.String.format; -import static java.util.Objects.requireNonNull; - -public class HiveBucketAdapterRecordCursor - extends ForwardingRecordCursor -{ - private final RecordCursor delegate; - private final int[] bucketColumnIndices; - private final List> javaTypeList; - private final List typeInfoList; - private final BucketingVersion bucketingVersion; - private final int tableBucketCount; - private final int partitionBucketCount; - private final int bucketToKeep; - - private final Object[] scratch; - - public HiveBucketAdapterRecordCursor( - int[] bucketColumnIndices, - List bucketColumnHiveTypes, - BucketingVersion bucketingVersion, - int tableBucketCount, - int partitionBucketCount, - int bucketToKeep, - TypeManager typeManager, - RecordCursor delegate) - { - this.bucketColumnIndices = requireNonNull(bucketColumnIndices, "bucketColumnIndices is null"); - this.delegate = requireNonNull(delegate, "delegate is null"); - requireNonNull(bucketColumnHiveTypes, "bucketColumnHiveTypes is null"); - this.javaTypeList = bucketColumnHiveTypes.stream() - .map(HiveType::getTypeSignature) - .map(typeManager::getType) - .map(Type::getJavaType) - .collect(toImmutableList()); - this.typeInfoList = bucketColumnHiveTypes.stream() - .map(HiveType::getTypeInfo) - .collect(toImmutableList()); - this.bucketingVersion = requireNonNull(bucketingVersion, "bucketingVersion is null"); - this.tableBucketCount = tableBucketCount; - this.partitionBucketCount = partitionBucketCount; - this.bucketToKeep = bucketToKeep; - - this.scratch = new Object[bucketColumnHiveTypes.size()]; - } - - @Override - protected RecordCursor delegate() - { - return delegate; - } - - @Override - public boolean advanceNextPosition() - { - while (true) { - if (Thread.interrupted()) { - // Stop processing if the query has been destroyed. 
- Thread.currentThread().interrupt(); - throw new TrinoException(GENERIC_INTERNAL_ERROR, "RecordCursor was interrupted"); - } - - boolean hasNextPosition = delegate.advanceNextPosition(); - if (!hasNextPosition) { - return false; - } - for (int i = 0; i < scratch.length; i++) { - int index = bucketColumnIndices[i]; - if (delegate.isNull(index)) { - scratch[i] = null; - continue; - } - Class javaType = javaTypeList.get(i); - if (javaType == boolean.class) { - scratch[i] = delegate.getBoolean(index); - } - else if (javaType == long.class) { - scratch[i] = delegate.getLong(index); - } - else if (javaType == double.class) { - scratch[i] = delegate.getDouble(index); - } - else if (javaType == Slice.class) { - scratch[i] = delegate.getSlice(index); - } - else if (javaType == Block.class) { - scratch[i] = delegate.getObject(index); - } - else { - throw new UnsupportedOperationException("Unknown java type: " + javaType); - } - } - int bucket = HiveBucketing.getHiveBucket(bucketingVersion, tableBucketCount, typeInfoList, scratch); - if ((bucket - bucketToKeep) % partitionBucketCount != 0) { - throw new TrinoException(HIVE_INVALID_BUCKET_FILES, format( - "A row that is supposed to be in bucket %s is encountered. Only rows in bucket %s (modulo %s) are expected", - bucket, bucketToKeep % partitionBucketCount, partitionBucketCount)); - } - if (bucket == bucketToKeep) { - return true; - } - } - } -} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveBucketValidationRecordCursor.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveBucketValidationRecordCursor.java deleted file mode 100644 index cb4a07b238a7b..0000000000000 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveBucketValidationRecordCursor.java +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
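The deleted HiveBucketAdapterRecordCursor above filtered rows by recomputing each row's table-level bucket and keeping only those matching the split's bucket, after first checking that the row is even possible for a partition written with fewer buckets. A minimal sketch of that check, using a plain IllegalStateException in place of the connector's TrinoException so it stands alone:

final class BucketAdapterCheck
{
    private BucketAdapterCheck() {}

    // bucket: table-level bucket recomputed from the row's bucketing columns
    // bucketToKeep: the bucket this split is reading
    // partitionBucketCount: bucket count the partition was physically written with
    static boolean keepRow(int bucket, int bucketToKeep, int partitionBucketCount)
    {
        // Rows in this file must satisfy bucket ≡ bucketToKeep (mod partitionBucketCount);
        // anything else indicates corrupt bucket files.
        if ((bucket - bucketToKeep) % partitionBucketCount != 0) {
            throw new IllegalStateException(
                    "A row that is supposed to be in bucket " + bucket + " is encountered. Only rows in bucket "
                            + (bucketToKeep % partitionBucketCount) + " (modulo " + partitionBucketCount + ") are expected");
        }
        // Emit the row only when the recomputed bucket matches exactly.
        return bucket == bucketToKeep;
    }
}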
- */ -package io.trino.plugin.hive; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.VerifyException; -import io.airlift.slice.Slice; -import io.trino.filesystem.Location; -import io.trino.plugin.hive.type.TypeInfo; -import io.trino.plugin.hive.util.ForwardingRecordCursor; -import io.trino.plugin.hive.util.HiveBucketing.BucketingVersion; -import io.trino.spi.TrinoException; -import io.trino.spi.block.Block; -import io.trino.spi.connector.RecordCursor; -import io.trino.spi.type.Type; -import io.trino.spi.type.TypeManager; - -import java.util.List; - -import static com.google.common.collect.ImmutableList.toImmutableList; -import static io.trino.plugin.hive.HiveErrorCode.HIVE_INVALID_BUCKET_FILES; -import static io.trino.plugin.hive.HivePageSource.BucketValidator.VALIDATION_STRIDE; -import static io.trino.plugin.hive.util.HiveBucketing.getHiveBucket; -import static java.lang.String.format; -import static java.util.Objects.requireNonNull; - -public class HiveBucketValidationRecordCursor - extends ForwardingRecordCursor -{ - private final RecordCursor delegate; - private final Location path; - private final int[] bucketColumnIndices; - private final List> javaTypeList; - private final List typeInfoList; - private final BucketingVersion bucketingVersion; - private final int bucketCount; - private final int expectedBucket; - - private final Object[] scratch; - - private int validationCounter; - - public HiveBucketValidationRecordCursor( - Location path, - int[] bucketColumnIndices, - List bucketColumnTypes, - BucketingVersion bucketingVersion, - int bucketCount, - int expectedBucket, - TypeManager typeManager, - RecordCursor delegate) - { - this.path = requireNonNull(path, "path is null"); - this.bucketColumnIndices = requireNonNull(bucketColumnIndices, "bucketColumnIndices is null"); - requireNonNull(bucketColumnTypes, "bucketColumnTypes is null"); - this.javaTypeList = bucketColumnTypes.stream() - .map(HiveType::getTypeSignature) - .map(typeManager::getType) - .map(Type::getJavaType) - .collect(toImmutableList()); - this.typeInfoList = bucketColumnTypes.stream() - .map(HiveType::getTypeInfo) - .collect(toImmutableList()); - this.bucketingVersion = requireNonNull(bucketingVersion, "bucketingVersion is null"); - this.bucketCount = bucketCount; - this.expectedBucket = expectedBucket; - this.delegate = requireNonNull(delegate, "delegate is null"); - - this.scratch = new Object[bucketColumnTypes.size()]; - } - - @VisibleForTesting - @Override - public RecordCursor delegate() - { - return delegate; - } - - @Override - public boolean advanceNextPosition() - { - if (!delegate.advanceNextPosition()) { - return false; - } - - if (validationCounter > 0) { - validationCounter--; - return true; - } - validationCounter = VALIDATION_STRIDE - 1; - - for (int i = 0; i < scratch.length; i++) { - int index = bucketColumnIndices[i]; - if (delegate.isNull(index)) { - scratch[i] = null; - continue; - } - Class javaType = javaTypeList.get(i); - if (javaType == boolean.class) { - scratch[i] = delegate.getBoolean(index); - } - else if (javaType == long.class) { - scratch[i] = delegate.getLong(index); - } - else if (javaType == double.class) { - scratch[i] = delegate.getDouble(index); - } - else if (javaType == Slice.class) { - scratch[i] = delegate.getSlice(index); - } - else if (javaType == Block.class) { - scratch[i] = delegate.getObject(index); - } - else { - throw new VerifyException("Unknown Java type: " + javaType); - } - } - - int bucket = getHiveBucket(bucketingVersion, 
bucketCount, typeInfoList, scratch); - if (bucket != expectedBucket) { - throw new TrinoException(HIVE_INVALID_BUCKET_FILES, - format("Hive table is corrupt. File '%s' is for bucket %s, but contains a row for bucket %s.", path, expectedBucket, bucket)); - } - - return true; - } -} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveConfig.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveConfig.java index ab85c5c7b8fbc..02dbd79c67535 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveConfig.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveConfig.java @@ -58,6 +58,10 @@ "hive.assume-canonical-partition-keys", "hive.partition-use-column-names", "hive.allow-corrupt-writes-for-testing", + "hive.optimize-symlink-listing", + "hive.s3select-pushdown.enabled", + "hive.s3select-pushdown.experimental-textfile-pushdown-enabled", + "hive.s3select-pushdown.max-connections", }) public class HiveConfig { @@ -133,10 +137,6 @@ public class HiveConfig private boolean ignoreCorruptedStatistics; private boolean collectColumnStatisticsOnWrite = true; - private boolean s3SelectPushdownEnabled; - private boolean s3SelectExperimentalPushdownEnabled; - private int s3SelectPushdownMaxConnections = 500; - private boolean isTemporaryStagingDirectoryEnabled = true; private String temporaryStagingDirectoryPath = "/tmp/presto-${USER}"; private boolean delegateTransactionalManagedTableLocationToMetastore; @@ -163,8 +163,6 @@ public class HiveConfig private HiveTimestampPrecision timestampPrecision = HiveTimestampPrecision.DEFAULT_PRECISION; - private boolean optimizeSymlinkListing = true; - private Optional icebergCatalogName = Optional.empty(); private Optional deltaLakeCatalogName = Optional.empty(); private Optional hudiCatalogName = Optional.empty(); @@ -1001,45 +999,6 @@ public HiveConfig setCollectColumnStatisticsOnWrite(boolean collectColumnStatist return this; } - public boolean isS3SelectPushdownEnabled() - { - return s3SelectPushdownEnabled; - } - - @Config("hive.s3select-pushdown.enabled") - @ConfigDescription("Enable query pushdown to JSON files using the AWS S3 Select service") - public HiveConfig setS3SelectPushdownEnabled(boolean s3SelectPushdownEnabled) - { - this.s3SelectPushdownEnabled = s3SelectPushdownEnabled; - return this; - } - - public boolean isS3SelectExperimentalPushdownEnabled() - { - return s3SelectExperimentalPushdownEnabled; - } - - @Config("hive.s3select-pushdown.experimental-textfile-pushdown-enabled") - @ConfigDescription("Enable query pushdown to TEXTFILE tables using the AWS S3 Select service") - public HiveConfig setS3SelectExperimentalPushdownEnabled(boolean s3SelectExperimentalPushdownEnabled) - { - this.s3SelectExperimentalPushdownEnabled = s3SelectExperimentalPushdownEnabled; - return this; - } - - @Min(1) - public int getS3SelectPushdownMaxConnections() - { - return s3SelectPushdownMaxConnections; - } - - @Config("hive.s3select-pushdown.max-connections") - public HiveConfig setS3SelectPushdownMaxConnections(int s3SelectPushdownMaxConnections) - { - this.s3SelectPushdownMaxConnections = s3SelectPushdownMaxConnections; - return this; - } - @Config("hive.temporary-staging-directory-enabled") @ConfigDescription("Should use (if possible) temporary staging directory for write operations") public HiveConfig setTemporaryStagingDirectoryEnabled(boolean temporaryStagingDirectoryEnabled) @@ -1186,19 +1145,6 @@ public HiveConfig setTimestampPrecision(HiveTimestampPrecision timestampPrecisio return this; } - public boolean 
isOptimizeSymlinkListing() - { - return this.optimizeSymlinkListing; - } - - @Config("hive.optimize-symlink-listing") - @ConfigDescription("Optimize listing for SymlinkTextFormat tables with files in a single directory") - public HiveConfig setOptimizeSymlinkListing(boolean optimizeSymlinkListing) - { - this.optimizeSymlinkListing = optimizeSymlinkListing; - return this; - } - public Optional getIcebergCatalogName() { return icebergCatalogName; diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveFormatsConfig.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveFormatsConfig.java deleted file mode 100644 index 43b343a80931f..0000000000000 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveFormatsConfig.java +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.trino.plugin.hive; - -import io.airlift.configuration.Config; -import io.airlift.configuration.ConfigDescription; - -public class HiveFormatsConfig -{ - private boolean avroFileNativeReaderEnabled = true; - private boolean avroFileNativeWriterEnabled = true; - private boolean csvNativeReaderEnabled = true; - private boolean csvNativeWriterEnabled = true; - private boolean jsonNativeReaderEnabled = true; - private boolean jsonNativeWriterEnabled = true; - private boolean openXJsonNativeReaderEnabled = true; - private boolean openXJsonNativeWriterEnabled = true; - private boolean regexNativeReaderEnabled = true; - private boolean textFileNativeReaderEnabled = true; - private boolean textFileNativeWriterEnabled = true; - private boolean sequenceFileNativeReaderEnabled = true; - private boolean sequenceFileNativeWriterEnabled = true; - - public boolean isAvroFileNativeReaderEnabled() - { - return avroFileNativeReaderEnabled; - } - - @Config("avro.native-reader.enabled") - @ConfigDescription("Use native Avro file reader") - public HiveFormatsConfig setAvroFileNativeReaderEnabled(boolean avroFileNativeReaderEnabled) - { - this.avroFileNativeReaderEnabled = avroFileNativeReaderEnabled; - return this; - } - - public boolean isAvroFileNativeWriterEnabled() - { - return avroFileNativeWriterEnabled; - } - - @Config("avro.native-writer.enabled") - @ConfigDescription("Use native Avro file writer") - public HiveFormatsConfig setAvroFileNativeWriterEnabled(boolean avroFileNativeWriterEnabled) - { - this.avroFileNativeWriterEnabled = avroFileNativeWriterEnabled; - return this; - } - - public boolean isCsvNativeReaderEnabled() - { - return csvNativeReaderEnabled; - } - - @Config("csv.native-reader.enabled") - @ConfigDescription("Use native CSV reader") - public HiveFormatsConfig setCsvNativeReaderEnabled(boolean csvNativeReaderEnabled) - { - this.csvNativeReaderEnabled = csvNativeReaderEnabled; - return this; - } - - public boolean isCsvNativeWriterEnabled() - { - return csvNativeWriterEnabled; - } - - @Config("csv.native-writer.enabled") - @ConfigDescription("Use native CSV writer") - public HiveFormatsConfig setCsvNativeWriterEnabled(boolean 
csvNativeWriterEnabled) - { - this.csvNativeWriterEnabled = csvNativeWriterEnabled; - return this; - } - - public boolean isJsonNativeReaderEnabled() - { - return jsonNativeReaderEnabled; - } - - @Config("json.native-reader.enabled") - @ConfigDescription("Use native JSON reader") - public HiveFormatsConfig setJsonNativeReaderEnabled(boolean jsonNativeReaderEnabled) - { - this.jsonNativeReaderEnabled = jsonNativeReaderEnabled; - return this; - } - - public boolean isJsonNativeWriterEnabled() - { - return jsonNativeWriterEnabled; - } - - @Config("json.native-writer.enabled") - @ConfigDescription("Use native JSON writer") - public HiveFormatsConfig setJsonNativeWriterEnabled(boolean jsonNativeWriterEnabled) - { - this.jsonNativeWriterEnabled = jsonNativeWriterEnabled; - return this; - } - - public boolean isOpenXJsonNativeReaderEnabled() - { - return openXJsonNativeReaderEnabled; - } - - @Config("openx-json.native-reader.enabled") - @ConfigDescription("Use native OpenXJson reader") - public HiveFormatsConfig setOpenXJsonNativeReaderEnabled(boolean openXJsonNativeReaderEnabled) - { - this.openXJsonNativeReaderEnabled = openXJsonNativeReaderEnabled; - return this; - } - - public boolean isOpenXJsonNativeWriterEnabled() - { - return openXJsonNativeWriterEnabled; - } - - @Config("openx-json.native-writer.enabled") - @ConfigDescription("Use native OpenXJson writer") - public HiveFormatsConfig setOpenXJsonNativeWriterEnabled(boolean openXJsonNativeWriterEnabled) - { - this.openXJsonNativeWriterEnabled = openXJsonNativeWriterEnabled; - return this; - } - - public boolean isRegexNativeReaderEnabled() - { - return regexNativeReaderEnabled; - } - - @Config("regex.native-reader.enabled") - @ConfigDescription("Use native REGEX reader") - public HiveFormatsConfig setRegexNativeReaderEnabled(boolean regexNativeReaderEnabled) - { - this.regexNativeReaderEnabled = regexNativeReaderEnabled; - return this; - } - - public boolean isTextFileNativeReaderEnabled() - { - return textFileNativeReaderEnabled; - } - - @Config("text-file.native-reader.enabled") - @ConfigDescription("Use native text file reader") - public HiveFormatsConfig setTextFileNativeReaderEnabled(boolean textFileNativeReaderEnabled) - { - this.textFileNativeReaderEnabled = textFileNativeReaderEnabled; - return this; - } - - public boolean isTextFileNativeWriterEnabled() - { - return textFileNativeWriterEnabled; - } - - @Config("text-file.native-writer.enabled") - @ConfigDescription("Use native text file writer") - public HiveFormatsConfig setTextFileNativeWriterEnabled(boolean textFileNativeWriterEnabled) - { - this.textFileNativeWriterEnabled = textFileNativeWriterEnabled; - return this; - } - - public boolean isSequenceFileNativeReaderEnabled() - { - return sequenceFileNativeReaderEnabled; - } - - @Config("sequence-file.native-reader.enabled") - @ConfigDescription("Use native sequence file reader") - public HiveFormatsConfig setSequenceFileNativeReaderEnabled(boolean sequenceFileNativeReaderEnabled) - { - this.sequenceFileNativeReaderEnabled = sequenceFileNativeReaderEnabled; - return this; - } - - public boolean isSequenceFileNativeWriterEnabled() - { - return sequenceFileNativeWriterEnabled; - } - - @Config("sequence-file.native-writer.enabled") - @ConfigDescription("Use native sequence file writer") - public HiveFormatsConfig setSequenceFileNativeWriterEnabled(boolean sequenceFileNativeWriterEnabled) - { - this.sequenceFileNativeWriterEnabled = sequenceFileNativeWriterEnabled; - return this; - } -} diff --git 
a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveMetadata.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveMetadata.java index d628d4e820704..f102acc779a77 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveMetadata.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveMetadata.java @@ -159,7 +159,6 @@ import static com.google.common.collect.ImmutableSet.toImmutableSet; import static com.google.common.collect.Iterables.concat; import static com.google.common.collect.Iterables.getOnlyElement; -import static com.google.common.reflect.Reflection.newProxy; import static io.trino.plugin.base.projection.ApplyProjectionUtil.extractSupportedProjectedColumns; import static io.trino.plugin.base.projection.ApplyProjectionUtil.replaceWithNewVariables; import static io.trino.plugin.hive.HiveAnalyzeProperties.getColumnNames; @@ -1941,7 +1940,7 @@ private void createEmptyFiles(ConnectorSession session, Location path, Table tab format, HiveCompressionCodec.NONE, schema, - nativeWriterAlwaysEnabled(session), + session, OptionalInt.empty(), NO_ACID_TRANSACTION, false, @@ -1953,16 +1952,6 @@ private void createEmptyFiles(ConnectorSession session, Location path, Table tab } } - private static ConnectorSession nativeWriterAlwaysEnabled(ConnectorSession session) - { - return newProxy(ConnectorSession.class, (proxy, method, args) -> { - if (method.getName().equals("getProperty") && ((String) args[0]).endsWith("_native_writer_enabled")) { - return true; - } - return method.invoke(session, args); - }); - } - @Override public RowChangeParadigm getRowChangeParadigm(ConnectorSession session, ConnectorTableHandle tableHandle) { diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveModule.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveModule.java index bb06e634a63ec..c081507cc2892 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveModule.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveModule.java @@ -52,8 +52,6 @@ import io.trino.plugin.hive.parquet.ParquetReaderConfig; import io.trino.plugin.hive.parquet.ParquetWriterConfig; import io.trino.plugin.hive.rcfile.RcFilePageSourceFactory; -import io.trino.plugin.hive.s3select.S3SelectRecordCursorProvider; -import io.trino.plugin.hive.s3select.TrinoS3ClientFactory; import io.trino.spi.connector.ConnectorNodePartitioningProvider; import io.trino.spi.connector.ConnectorPageSinkProvider; import io.trino.spi.connector.ConnectorPageSourceProvider; @@ -88,8 +86,6 @@ public void configure(Binder binder) newOptionalBinder(binder, HiveMaterializedViewPropertiesProvider.class) .setDefault().toInstance(ImmutableList::of); - binder.bind(TrinoS3ClientFactory.class).in(Scopes.SINGLETON); - binder.bind(CachingDirectoryLister.class).in(Scopes.SINGLETON); newExporter(binder).export(CachingDirectoryLister.class).withGeneratedName(); @@ -129,8 +125,6 @@ public void configure(Binder binder) newExporter(binder).export(HdfsNamenodeStats.class) .as(generator -> generator.generatedNameOf(NamenodeStats.class)); - configBinder(binder).bindConfig(HiveFormatsConfig.class); - Multibinder pageSourceFactoryBinder = newSetBinder(binder, HivePageSourceFactory.class); pageSourceFactoryBinder.addBinding().to(CsvPageSourceFactory.class).in(Scopes.SINGLETON); pageSourceFactoryBinder.addBinding().to(JsonPageSourceFactory.class).in(Scopes.SINGLETON); @@ -143,11 +137,6 @@ public void configure(Binder binder) 
pageSourceFactoryBinder.addBinding().to(RcFilePageSourceFactory.class).in(Scopes.SINGLETON); pageSourceFactoryBinder.addBinding().to(AvroPageSourceFactory.class).in(Scopes.SINGLETON); - Multibinder recordCursorProviderBinder = newSetBinder(binder, HiveRecordCursorProvider.class); - recordCursorProviderBinder.addBinding().to(S3SelectRecordCursorProvider.class).in(Scopes.SINGLETON); - - binder.bind(GenericHiveRecordCursorProvider.class).in(Scopes.SINGLETON); - Multibinder fileWriterFactoryBinder = newSetBinder(binder, HiveFileWriterFactory.class); binder.bind(OrcFileWriterFactory.class).in(Scopes.SINGLETON); newExporter(binder).export(OrcFileWriterFactory.class).withGeneratedName(); diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HivePageSink.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HivePageSink.java index 26e84921435fb..450958abd93e6 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HivePageSink.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HivePageSink.java @@ -19,10 +19,8 @@ import com.google.common.util.concurrent.Futures; import com.google.common.util.concurrent.ListenableFuture; import com.google.common.util.concurrent.ListeningExecutorService; -import io.airlift.concurrent.MoreFutures; import io.airlift.json.JsonCodec; import io.airlift.slice.Slice; -import io.trino.hdfs.HdfsEnvironment; import io.trino.plugin.hive.util.HiveBucketing.BucketingVersion; import io.trino.spi.Page; import io.trino.spi.PageIndexer; @@ -54,6 +52,7 @@ import static com.google.common.base.Verify.verify; import static com.google.common.collect.ImmutableList.toImmutableList; import static com.google.common.util.concurrent.MoreExecutors.directExecutor; +import static io.airlift.concurrent.MoreFutures.toCompletableFuture; import static io.airlift.slice.Slices.wrappedBuffer; import static io.trino.plugin.hive.HiveErrorCode.HIVE_TOO_MANY_OPEN_PARTITIONS; import static io.trino.plugin.hive.HiveErrorCode.HIVE_WRITER_CLOSE_ERROR; @@ -78,7 +77,6 @@ public class HivePageSink private final HiveBucketFunction bucketFunction; private final HiveWriterPagePartitioner pagePartitioner; - private final HdfsEnvironment hdfsEnvironment; private final int maxOpenWriters; private final ListeningExecutorService writeVerificationExecutor; @@ -87,8 +85,6 @@ public class HivePageSink private final List writers = new ArrayList<>(); - private final ConnectorSession session; - private final long targetMaxFileSize; private final List closedWriterRollbackActions = new ArrayList<>(); private final List partitionUpdates = new ArrayList<>(); @@ -106,7 +102,6 @@ public HivePageSink( boolean isTransactional, Optional bucketProperty, PageIndexerFactory pageIndexerFactory, - HdfsEnvironment hdfsEnvironment, int maxOpenWriters, ListeningExecutorService writeVerificationExecutor, JsonCodec partitionUpdateCodec, @@ -119,7 +114,6 @@ public HivePageSink( requireNonNull(pageIndexerFactory, "pageIndexerFactory is null"); this.isTransactional = isTransactional; - this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); this.maxOpenWriters = maxOpenWriters; this.writeVerificationExecutor = requireNonNull(writeVerificationExecutor, "writeVerificationExecutor is null"); this.partitionUpdateCodec = requireNonNull(partitionUpdateCodec, "partitionUpdateCodec is null"); @@ -167,7 +161,6 @@ public HivePageSink( bucketFunction = null; } - this.session = requireNonNull(session, "session is null"); this.targetMaxFileSize = 
HiveSessionProperties.getTargetMaxFileSize(session).toBytes(); } @@ -192,13 +185,7 @@ public long getValidationCpuNanos() @Override public CompletableFuture> finish() { - // Must be wrapped in doAs entirely - // Implicit FileSystem initializations are possible in HiveRecordWriter#commit -> RecordWriter#close - ListenableFuture> result = hdfsEnvironment.doAs( - session.getIdentity(), - isMergeSink ? this::doMergeSinkFinish : this::doInsertSinkFinish); - - return MoreFutures.toCompletableFuture(result); + return toCompletableFuture(isMergeSink ? doMergeSinkFinish() : doInsertSinkFinish()); } private ListenableFuture> doMergeSinkFinish() @@ -244,13 +231,6 @@ private ListenableFuture> doInsertSinkFinish() @Override public void abort() - { - // Must be wrapped in doAs entirely - // Implicit FileSystem initializations are possible in HiveRecordWriter#rollback -> RecordWriter#close - hdfsEnvironment.doAs(session.getIdentity(), this::doAbort); - } - - private void doAbort() { List rollbackActions = Streams.concat( writers.stream() @@ -278,17 +258,6 @@ private void doAbort() @Override public CompletableFuture appendPage(Page page) - { - if (page.getPositionCount() > 0) { - // Must be wrapped in doAs entirely - // Implicit FileSystem initializations are possible in HiveRecordWriter#addRow or #createWriter - hdfsEnvironment.doAs(session.getIdentity(), () -> doAppend(page)); - } - - return NOT_BLOCKED; - } - - private void doAppend(Page page) { int writeOffset = 0; while (writeOffset < page.getPositionCount()) { @@ -296,6 +265,7 @@ private void doAppend(Page page) writeOffset += chunk.getPositionCount(); writePage(chunk); } + return NOT_BLOCKED; } private void writePage(Page page) diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HivePageSinkProvider.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HivePageSinkProvider.java index cd0f2032e4ca3..e772b27ad6cab 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HivePageSinkProvider.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HivePageSinkProvider.java @@ -22,7 +22,6 @@ import io.airlift.json.JsonCodec; import io.airlift.units.DataSize; import io.trino.filesystem.TrinoFileSystemFactory; -import io.trino.hdfs.HdfsEnvironment; import io.trino.plugin.hive.metastore.HiveMetastoreFactory; import io.trino.plugin.hive.metastore.HivePageSinkMetadataProvider; import io.trino.plugin.hive.metastore.SortingColumn; @@ -40,7 +39,6 @@ import io.trino.spi.connector.ConnectorTableExecuteHandle; import io.trino.spi.connector.ConnectorTransactionHandle; import io.trino.spi.type.TypeManager; -import org.joda.time.DateTimeZone; import java.util.List; import java.util.Map; @@ -60,7 +58,6 @@ public class HivePageSinkProvider { private final Set fileWriterFactories; private final TrinoFileSystemFactory fileSystemFactory; - private final HdfsEnvironment hdfsEnvironment; private final PageSorter pageSorter; private final HiveMetastoreFactory metastoreFactory; private final PageIndexerFactory pageIndexerFactory; @@ -76,7 +73,6 @@ public class HivePageSinkProvider private final HiveSessionProperties hiveSessionProperties; private final HiveWriterStats hiveWriterStats; private final long perTransactionMetastoreCacheMaximumSize; - private final DateTimeZone parquetTimeZone; private final boolean temporaryStagingDirectoryDirectoryEnabled; private final String temporaryStagingDirectoryPath; @@ -84,7 +80,6 @@ public class HivePageSinkProvider public HivePageSinkProvider( Set fileWriterFactories, TrinoFileSystemFactory 
fileSystemFactory, - HdfsEnvironment hdfsEnvironment, PageSorter pageSorter, HiveMetastoreFactory metastoreFactory, PageIndexerFactory pageIndexerFactory, @@ -100,7 +95,6 @@ public HivePageSinkProvider( { this.fileWriterFactories = ImmutableSet.copyOf(requireNonNull(fileWriterFactories, "fileWriterFactories is null")); this.fileSystemFactory = requireNonNull(fileSystemFactory, "fileSystemFactory is null"); - this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); this.pageSorter = requireNonNull(pageSorter, "pageSorter is null"); this.metastoreFactory = requireNonNull(metastoreFactory, "metastoreFactory is null"); this.pageIndexerFactory = requireNonNull(pageIndexerFactory, "pageIndexerFactory is null"); @@ -116,7 +110,6 @@ public HivePageSinkProvider( this.hiveSessionProperties = requireNonNull(hiveSessionProperties, "hiveSessionProperties is null"); this.hiveWriterStats = requireNonNull(hiveWriterStats, "hiveWriterStats is null"); this.perTransactionMetastoreCacheMaximumSize = config.getPerTransactionMetastoreCacheMaximumSize(); - this.parquetTimeZone = config.getParquetDateTimeZone(); this.temporaryStagingDirectoryDirectoryEnabled = config.isTemporaryStagingDirectoryEnabled(); this.temporaryStagingDirectoryPath = config.getTemporaryStagingDirectoryPath(); } @@ -181,11 +174,9 @@ private HivePageSink createPageSink(HiveWritableTableHandle handle, boolean isCr handle.getPageSinkMetadata(), new HiveMetastoreClosure(memoizeMetastore(metastoreFactory.createMetastore(Optional.of(session.getIdentity())), perTransactionMetastoreCacheMaximumSize))), typeManager, - hdfsEnvironment, pageSorter, writerSortBufferSize, maxOpenSortFiles, - parquetTimeZone, session, nodeManager, eventClient, @@ -201,7 +192,6 @@ private HivePageSink createPageSink(HiveWritableTableHandle handle, boolean isCr handle.isTransactional(), handle.getBucketProperty(), pageIndexerFactory, - hdfsEnvironment, maxOpenPartitions, writeVerificationExecutor, partitionUpdateCodec, diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HivePageSource.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HivePageSource.java index 947a299fd593c..2556bca48fb32 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HivePageSource.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HivePageSource.java @@ -27,7 +27,6 @@ import io.trino.spi.block.LazyBlockLoader; import io.trino.spi.block.RunLengthEncodedBlock; import io.trino.spi.connector.ConnectorPageSource; -import io.trino.spi.connector.RecordCursor; import io.trino.spi.metrics.Metrics; import io.trino.spi.type.Type; import io.trino.spi.type.TypeManager; @@ -369,20 +368,5 @@ public void validate(Page page) } } } - - public RecordCursor wrapRecordCursor(RecordCursor delegate, TypeManager typeManager) - { - return new HiveBucketValidationRecordCursor( - path, - bucketColumnIndices, - bucketColumnTypes.stream() - .map(HiveType::toHiveType) - .collect(toImmutableList()), - bucketingVersion, - bucketCount, - expectedBucket, - typeManager, - delegate); - } } } diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HivePageSourceProvider.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HivePageSourceProvider.java index f7871ebbd5fa9..709cd854d9716 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HivePageSourceProvider.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HivePageSourceProvider.java @@ -19,15 +19,13 @@ import com.google.common.collect.ImmutableSet; import 
com.google.inject.Inject; import io.trino.filesystem.Location; -import io.trino.hdfs.HdfsContext; -import io.trino.hdfs.HdfsEnvironment; import io.trino.plugin.hive.HivePageSource.BucketValidator; -import io.trino.plugin.hive.HiveRecordCursorProvider.ReaderRecordCursorWithProjections; import io.trino.plugin.hive.HiveSplit.BucketConversion; import io.trino.plugin.hive.HiveSplit.BucketValidation; import io.trino.plugin.hive.acid.AcidTransaction; import io.trino.plugin.hive.type.TypeInfo; import io.trino.plugin.hive.util.HiveBucketing.BucketingVersion; +import io.trino.spi.TrinoException; import io.trino.spi.connector.ColumnHandle; import io.trino.spi.connector.ConnectorPageSource; import io.trino.spi.connector.ConnectorPageSourceProvider; @@ -37,15 +35,10 @@ import io.trino.spi.connector.ConnectorTransactionHandle; import io.trino.spi.connector.DynamicFilter; import io.trino.spi.connector.EmptyPageSource; -import io.trino.spi.connector.RecordCursor; -import io.trino.spi.connector.RecordPageSource; import io.trino.spi.predicate.Domain; import io.trino.spi.predicate.NullableValue; import io.trino.spi.predicate.TupleDomain; -import io.trino.spi.type.Type; import io.trino.spi.type.TypeManager; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; import java.util.ArrayList; import java.util.HashMap; @@ -68,12 +61,15 @@ import static io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; import static io.trino.plugin.hive.HiveColumnHandle.ColumnType.SYNTHESIZED; import static io.trino.plugin.hive.HiveColumnHandle.isRowIdColumnHandle; +import static io.trino.plugin.hive.HiveErrorCode.HIVE_UNSUPPORTED_FORMAT; import static io.trino.plugin.hive.HivePageSourceProvider.ColumnMapping.toColumnHandles; import static io.trino.plugin.hive.HivePageSourceProvider.ColumnMappingKind.PREFILLED; import static io.trino.plugin.hive.HiveSessionProperties.getTimestampPrecision; import static io.trino.plugin.hive.coercions.CoercionUtils.createTypeFromCoercer; import static io.trino.plugin.hive.util.HiveBucketing.HiveBucketFilter; import static io.trino.plugin.hive.util.HiveBucketing.getHiveBucketFilter; +import static io.trino.plugin.hive.util.HiveUtil.getDeserializerClassName; +import static io.trino.plugin.hive.util.HiveUtil.getInputFormatName; import static io.trino.plugin.hive.util.HiveUtil.getPrefilledColumnValue; import static java.util.Objects.requireNonNull; import static java.util.function.Function.identity; @@ -86,28 +82,15 @@ public class HivePageSourceProvider private static final Pattern ORIGINAL_FILE_PATH_MATCHER = Pattern.compile("(?s)(?.*)/(?(?\\d+)_(?.*)?)$"); private final TypeManager typeManager; - private final HdfsEnvironment hdfsEnvironment; private final int domainCompactionThreshold; private final Set pageSourceFactories; - private final Set cursorProviders; @Inject - public HivePageSourceProvider( - TypeManager typeManager, - HdfsEnvironment hdfsEnvironment, - HiveConfig hiveConfig, - Set pageSourceFactories, - Set cursorProviders, - GenericHiveRecordCursorProvider genericCursorProvider) + public HivePageSourceProvider(TypeManager typeManager, HiveConfig hiveConfig, Set pageSourceFactories) { this.typeManager = requireNonNull(typeManager, "typeManager is null"); - this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); this.domainCompactionThreshold = hiveConfig.getDomainCompactionThreshold(); this.pageSourceFactories = ImmutableSet.copyOf(requireNonNull(pageSourceFactories, "pageSourceFactories is null")); - this.cursorProviders = 
ImmutableSet.builder() - .addAll(requireNonNull(cursorProviders, "cursorProviders is null")) - .add(genericCursorProvider) // generic should be last, as a fallback option - .build(); } @Override @@ -149,12 +132,8 @@ public ConnectorPageSource createPageSource( return new EmptyPageSource(); } - Configuration configuration = hdfsEnvironment.getConfiguration(new HdfsContext(session), new Path(hiveSplit.getPath())); - Optional pageSource = createHivePageSource( pageSourceFactories, - cursorProviders, - configuration, session, Location.of(hiveSplit.getPath()), hiveSplit.getTableBucketNumber(), @@ -165,11 +144,9 @@ public ConnectorPageSource createPageSource( hiveTable.getCompactEffectivePredicate().intersect( dynamicFilter.getCurrentPredicate().transformKeys(HiveColumnHandle.class::cast)) .simplify(domainCompactionThreshold), - hiveColumns, typeManager, hiveSplit.getBucketConversion(), hiveSplit.getBucketValidation(), - hiveSplit.isS3SelectPushdownEnabled(), hiveSplit.getAcidInfo(), originalFile, hiveTable.getTransaction(), @@ -178,13 +155,16 @@ public ConnectorPageSource createPageSource( if (pageSource.isPresent()) { return pageSource.get(); } - throw new RuntimeException("Could not find a file reader for split " + hiveSplit); + + throw new TrinoException(HIVE_UNSUPPORTED_FORMAT, "Unsupported input format: serde=%s, format=%s, partition=%s, path=%s".formatted( + getDeserializerClassName(hiveSplit.getSchema()), + getInputFormatName(hiveSplit.getSchema()).orElse(null), + hiveSplit.getPartitionName(), + hiveSplit.getPath())); } public static Optional createHivePageSource( Set pageSourceFactories, - Set cursorProviders, - Configuration configuration, ConnectorSession session, Location path, OptionalInt tableBucketNumber, @@ -193,11 +173,9 @@ public static Optional createHivePageSource( long estimatedFileSize, Properties schema, TupleDomain effectivePredicate, - List columns, TypeManager typeManager, Optional bucketConversion, Optional bucketValidation, - boolean s3SelectPushdownEnabled, Optional acidInfo, boolean originalFile, AcidTransaction transaction, @@ -215,7 +193,7 @@ public static Optional createHivePageSource( HiveTimestampPrecision timestampPrecision = getTimestampPrecision(session); for (HivePageSourceFactory pageSourceFactory : pageSourceFactories) { - List desiredColumns = toColumnHandles(regularAndInterimColumnMappings, true, typeManager, timestampPrecision); + List desiredColumns = toColumnHandles(regularAndInterimColumnMappings, typeManager, timestampPrecision); Optional readerWithProjections = pageSourceFactory.createPageSource( session, @@ -251,58 +229,6 @@ public static Optional createHivePageSource( } } - for (HiveRecordCursorProvider provider : cursorProviders) { - List desiredColumns = toColumnHandles(regularAndInterimColumnMappings, false, typeManager, timestampPrecision); - Optional readerWithProjections = provider.createRecordCursor( - configuration, - session, - path, - start, - length, - estimatedFileSize, - schema, - desiredColumns, - effectivePredicate, - typeManager, - s3SelectPushdownEnabled); - - if (readerWithProjections.isPresent()) { - RecordCursor delegate = readerWithProjections.get().getRecordCursor(); - Optional projections = readerWithProjections.get().getProjectedReaderColumns(); - - if (projections.isPresent()) { - ReaderProjectionsAdapter projectionsAdapter = hiveProjectionsAdapter(desiredColumns, projections.get()); - delegate = new HiveReaderProjectionsAdaptingRecordCursor(delegate, projectionsAdapter); - } - - checkArgument(acidInfo.isEmpty(), "Acid 
is not supported"); - - if (bucketAdaptation.isPresent()) { - delegate = new HiveBucketAdapterRecordCursor( - bucketAdaptation.get().getBucketColumnIndices(), - bucketAdaptation.get().getBucketColumnHiveTypes(), - bucketAdaptation.get().getBucketingVersion(), - bucketAdaptation.get().getTableBucketCount(), - bucketAdaptation.get().getPartitionBucketCount(), - bucketAdaptation.get().getBucketToKeep(), - typeManager, - delegate); - } - - // bucket adaptation already validates that data is in the right bucket - if (bucketAdaptation.isEmpty() && bucketValidator.isPresent()) { - delegate = bucketValidator.get().wrapRecordCursor(delegate, typeManager); - } - - HiveRecordCursor hiveRecordCursor = new HiveRecordCursor(columnMappings, delegate); - List columnTypes = columns.stream() - .map(HiveColumnHandle::getType) - .collect(toList()); - - return Optional.of(new RecordPageSource(columnTypes, hiveRecordCursor)); - } - } - return Optional.empty(); } @@ -547,12 +473,12 @@ public static List extractRegularAndInterimColumnMappings(List toColumnHandles(List regularColumnMappings, boolean doCoercion, TypeManager typeManager, HiveTimestampPrecision timestampPrecision) + public static List toColumnHandles(List regularColumnMappings, TypeManager typeManager, HiveTimestampPrecision timestampPrecision) { return regularColumnMappings.stream() .map(columnMapping -> { HiveColumnHandle columnHandle = columnMapping.getHiveColumnHandle(); - if (!doCoercion || columnMapping.getBaseTypeCoercionFrom().isEmpty()) { + if (columnMapping.getBaseTypeCoercionFrom().isEmpty()) { return columnHandle; } HiveType fromHiveTypeBase = columnMapping.getBaseTypeCoercionFrom().get(); diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveReaderProjectionsAdaptingRecordCursor.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveReaderProjectionsAdaptingRecordCursor.java deleted file mode 100644 index 4b22c13261b95..0000000000000 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveReaderProjectionsAdaptingRecordCursor.java +++ /dev/null @@ -1,212 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.trino.plugin.hive; - -import com.google.common.collect.Iterables; -import io.airlift.slice.Slice; -import io.trino.plugin.hive.ReaderProjectionsAdapter.ChannelMapping; -import io.trino.plugin.hive.util.ForwardingRecordCursor; -import io.trino.spi.block.Block; -import io.trino.spi.connector.RecordCursor; -import io.trino.spi.type.Type; - -import java.util.List; - -import static com.google.common.base.Preconditions.checkArgument; -import static java.util.Objects.requireNonNull; - -/** - * Applies projections on delegate fields provided by {@link ChannelMapping} to produce fields expected from this cursor. 
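With the record-cursor fallback removed above, page source creation reduces to asking each registered factory in turn, taking the first reader it offers, and otherwise failing with a descriptive unsupported-format error (the real code throws `TrinoException(HIVE_UNSUPPORTED_FORMAT, ...)`). A minimal sketch of that shape, using a hypothetical `ReaderFactory` stand-in rather than the real `HivePageSourceFactory` API:

```java
import java.util.List;
import java.util.Optional;

public class ReaderResolutionSketch
{
    // Hypothetical stand-in for HivePageSourceFactory; returns a reader only for formats it supports
    interface ReaderFactory
    {
        Optional<String> createReader(String serde);
    }

    static String resolveReader(List<ReaderFactory> factories, String serde)
    {
        for (ReaderFactory factory : factories) {
            Optional<String> reader = factory.createReader(serde);
            if (reader.isPresent()) {
                return reader.get();
            }
        }
        // Mirrors the new behavior: no generic fallback, fail with a descriptive error instead
        throw new IllegalArgumentException("Unsupported input format: serde=%s".formatted(serde));
    }

    public static void main(String[] args)
    {
        ReaderFactory orc = serde -> serde.contains("OrcSerde") ? Optional.of("orc-reader") : Optional.empty();
        ReaderFactory parquet = serde -> serde.contains("ParquetHiveSerDe") ? Optional.of("parquet-reader") : Optional.empty();

        System.out.println(resolveReader(List.of(orc, parquet), "org.apache.hadoop.hive.ql.io.orc.OrcSerde"));
        // resolveReader(List.of(orc, parquet), "com.example.UnknownSerDe") would throw
    }
}
```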
- */ -public class HiveReaderProjectionsAdaptingRecordCursor - extends ForwardingRecordCursor -{ - private final RecordCursor delegate; - private final ChannelMapping[] channelMappings; - private final Type[] outputTypes; - private final Type[] inputTypes; - - private final Type[] baseTypes; - - public HiveReaderProjectionsAdaptingRecordCursor(RecordCursor delegate, ReaderProjectionsAdapter projectionsAdapter) - { - this.delegate = requireNonNull(delegate, "delegate is null"); - requireNonNull(projectionsAdapter, "projectionsAdapter is null"); - - this.channelMappings = new ChannelMapping[projectionsAdapter.getOutputToInputMapping().size()]; - projectionsAdapter.getOutputToInputMapping().toArray(channelMappings); - - this.outputTypes = new Type[projectionsAdapter.getOutputTypes().size()]; - projectionsAdapter.getOutputTypes().toArray(outputTypes); - - this.inputTypes = new Type[projectionsAdapter.getInputTypes().size()]; - projectionsAdapter.getInputTypes().toArray(inputTypes); - - this.baseTypes = new Type[outputTypes.length]; - for (int i = 0; i < baseTypes.length; i++) { - Type type = inputTypes[channelMappings[i].getInputChannelIndex()]; - List dereferences = channelMappings[i].getDereferenceSequence(); - for (int j = 0; j < dereferences.size(); j++) { - type = type.getTypeParameters().get(dereferences.get(j)); - } - baseTypes[i] = type; - } - } - - @Override - protected RecordCursor delegate() - { - return delegate; - } - - @Override - public Type getType(int field) - { - return outputTypes[field]; - } - - private Block applyDereferences(Block baseObject, List dereferences, int length) - { - checkArgument(length <= dereferences.size()); - Block current = baseObject; - for (int i = 0; i < length; i++) { - current = current.getObject(dereferences.get(i), Block.class); - } - return current; - } - - @Override - public boolean getBoolean(int field) - { - int inputFieldIndex = channelMappings[field].getInputChannelIndex(); - List dereferences = channelMappings[field].getDereferenceSequence(); - - if (dereferences.isEmpty()) { - return delegate.getBoolean(inputFieldIndex); - } - - // Get SingleRowBlock corresponding to the element at current position - Block elementBlock = (Block) delegate.getObject(inputFieldIndex); - - // Apply dereferences except for the last one, which is type dependent - Block baseObject = applyDereferences(elementBlock, dereferences, dereferences.size() - 1); - - return baseTypes[field].getBoolean(baseObject, Iterables.getLast(dereferences)); - } - - @Override - public long getLong(int field) - { - int inputFieldIndex = channelMappings[field].getInputChannelIndex(); - List dereferences = channelMappings[field].getDereferenceSequence(); - - if (dereferences.isEmpty()) { - return delegate.getLong(inputFieldIndex); - } - - // Get SingleRowBlock corresponding to the element at current position - Block elementBlock = (Block) delegate.getObject(inputFieldIndex); - - // Apply dereferences except for the last one, which is type dependent - Block baseObject = applyDereferences(elementBlock, dereferences, dereferences.size() - 1); - - return baseTypes[field].getLong(baseObject, Iterables.getLast(dereferences)); - } - - @Override - public double getDouble(int field) - { - int inputFieldIndex = channelMappings[field].getInputChannelIndex(); - List dereferences = channelMappings[field].getDereferenceSequence(); - - if (dereferences.isEmpty()) { - return delegate.getDouble(inputFieldIndex); - } - - // Get SingleRowBlock corresponding to the element at current position - Block 
elementBlock = (Block) delegate.getObject(inputFieldIndex); - - // Apply dereferences except for the last one, which is type dependent - Block baseObject = applyDereferences(elementBlock, dereferences, dereferences.size() - 1); - - return baseTypes[field].getDouble(baseObject, Iterables.getLast(dereferences)); - } - - @Override - public Slice getSlice(int field) - { - int inputFieldIndex = channelMappings[field].getInputChannelIndex(); - List dereferences = channelMappings[field].getDereferenceSequence(); - - if (dereferences.isEmpty()) { - return delegate.getSlice(inputFieldIndex); - } - - // Get SingleRowBlock corresponding to the element at current position - Block elementBlock = (Block) delegate.getObject(inputFieldIndex); - - // Apply dereferences except for the last one, which is type dependent - Block baseObject = applyDereferences(elementBlock, dereferences, dereferences.size() - 1); - - return baseTypes[field].getSlice(baseObject, Iterables.getLast(dereferences)); - } - - @Override - public Object getObject(int field) - { - int inputFieldIndex = channelMappings[field].getInputChannelIndex(); - List dereferences = channelMappings[field].getDereferenceSequence(); - - if (dereferences.isEmpty()) { - return delegate.getObject(inputFieldIndex); - } - - // Get SingleRowBlock corresponding to the element at current position - Block elementBlock = (Block) delegate.getObject(inputFieldIndex); - - // Apply dereferences except for the last one, which is type dependent - Block baseObject = applyDereferences(elementBlock, dereferences, dereferences.size() - 1); - - return baseTypes[field].getObject(baseObject, Iterables.getLast(dereferences)); - } - - @Override - public boolean isNull(int field) - { - int inputFieldIndex = channelMappings[field].getInputChannelIndex(); - List dereferences = channelMappings[field].getDereferenceSequence(); - - if (dereferences.isEmpty()) { - return delegate.isNull(inputFieldIndex); - } - - if (delegate.isNull(inputFieldIndex)) { - return true; - } - - // Get SingleRowBlock corresponding to the element at current position - Block baseObject = (Block) delegate.getObject(inputFieldIndex); - - for (int j = 0; j < dereferences.size() - 1; j++) { - int dereferenceIndex = dereferences.get(j); - if (baseObject.isNull(dereferenceIndex)) { - return true; - } - baseObject = baseObject.getObject(dereferenceIndex, Block.class); - } - - int finalDereference = Iterables.getLast(dereferences); - return baseObject.isNull(finalDereference); - } -} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveRecordCursor.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveRecordCursor.java deleted file mode 100644 index 172f8f4808497..0000000000000 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveRecordCursor.java +++ /dev/null @@ -1,220 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
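The deleted adapter above resolves each projected column by walking a "dereference sequence", a list of field indices applied to a nested row structure; the real code stops one step before the end so the final access can use the type-specific getter on a `Block`. As a rough, simplified illustration of the idea (walking the full chain, and using plain arrays instead of Trino's `Block` API):

```java
import java.util.List;

public class DereferenceSketch
{
    // A row is modeled as an Object[]; nested rows are Object[] too.
    // A dereference chain like [2, 1] means: take field 2 of the row, then field 1 of that nested row.
    static Object dereference(Object[] row, List<Integer> chain)
    {
        Object current = row;
        for (int index : chain) {
            current = ((Object[]) current)[index];
        }
        return current;
    }

    public static void main(String[] args)
    {
        Object[] address = {"42 Main St", "Springfield"};
        Object[] row = {"alice", 30, address};

        System.out.println(dereference(row, List.of(2, 1))); // Springfield
        System.out.println(dereference(row, List.of(0)));    // alice
    }
}
```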
- */ -package io.trino.plugin.hive; - -import com.google.common.annotations.VisibleForTesting; -import io.airlift.slice.Slice; -import io.trino.plugin.hive.HivePageSourceProvider.ColumnMapping; -import io.trino.plugin.hive.util.ForwardingRecordCursor; -import io.trino.spi.TrinoException; -import io.trino.spi.connector.RecordCursor; -import io.trino.spi.type.CharType; -import io.trino.spi.type.DecimalType; -import io.trino.spi.type.Type; -import io.trino.spi.type.VarcharType; - -import java.util.List; - -import static io.trino.plugin.hive.HivePageSourceProvider.ColumnMappingKind.EMPTY; -import static io.trino.plugin.hive.HivePageSourceProvider.ColumnMappingKind.PREFILLED; -import static io.trino.plugin.hive.HivePageSourceProvider.ColumnMappingKind.REGULAR; -import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED; -import static io.trino.spi.type.BigintType.BIGINT; -import static io.trino.spi.type.BooleanType.BOOLEAN; -import static io.trino.spi.type.DateType.DATE; -import static io.trino.spi.type.DoubleType.DOUBLE; -import static io.trino.spi.type.IntegerType.INTEGER; -import static io.trino.spi.type.RealType.REAL; -import static io.trino.spi.type.SmallintType.SMALLINT; -import static io.trino.spi.type.TimestampType.TIMESTAMP_MILLIS; -import static io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_MILLIS; -import static io.trino.spi.type.TinyintType.TINYINT; -import static java.lang.String.format; -import static java.util.Objects.requireNonNull; - -public class HiveRecordCursor - extends ForwardingRecordCursor -{ - private final RecordCursor delegate; - - private final List columnMappings; - private final Type[] types; - - private final boolean[] booleans; - private final long[] longs; - private final double[] doubles; - private final Slice[] slices; - private final Object[] objects; - private final boolean[] nulls; - - public HiveRecordCursor(List columnMappings, RecordCursor delegate) - { - requireNonNull(columnMappings, "columnMappings is null"); - - this.delegate = requireNonNull(delegate, "delegate is null"); - this.columnMappings = columnMappings; - - int size = columnMappings.size(); - - this.types = new Type[size]; - - this.booleans = new boolean[size]; - this.longs = new long[size]; - this.doubles = new double[size]; - this.slices = new Slice[size]; - this.objects = new Object[size]; - this.nulls = new boolean[size]; - - for (int columnIndex = 0; columnIndex < size; columnIndex++) { - ColumnMapping columnMapping = columnMappings.get(columnIndex); - - if (columnMapping.getKind() == EMPTY) { - nulls[columnIndex] = true; - } - if (columnMapping.getKind() == PREFILLED) { - Object prefilledValue = columnMapping.getPrefilledValue().getValue(); - String name = columnMapping.getHiveColumnHandle().getName(); - Type type = columnMapping.getHiveColumnHandle().getType(); - types[columnIndex] = type; - - if (prefilledValue == null) { - nulls[columnIndex] = true; - } - else if (BOOLEAN.equals(type)) { - booleans[columnIndex] = (boolean) prefilledValue; - } - else if (TINYINT.equals(type)) { - longs[columnIndex] = (long) prefilledValue; - } - else if (SMALLINT.equals(type)) { - longs[columnIndex] = (long) prefilledValue; - } - else if (INTEGER.equals(type)) { - longs[columnIndex] = (long) prefilledValue; - } - else if (BIGINT.equals(type)) { - longs[columnIndex] = (long) prefilledValue; - } - else if (REAL.equals(type)) { - longs[columnIndex] = (long) prefilledValue; - } - else if (DOUBLE.equals(type)) { - doubles[columnIndex] = (double) prefilledValue; - } - else if (type instanceof 
VarcharType) { - slices[columnIndex] = (Slice) prefilledValue; - } - else if (type instanceof CharType) { - slices[columnIndex] = (Slice) prefilledValue; - } - else if (DATE.equals(type)) { - longs[columnIndex] = (long) prefilledValue; - } - else if (TIMESTAMP_MILLIS.equals(type)) { - longs[columnIndex] = (long) prefilledValue; - } - else if (TIMESTAMP_TZ_MILLIS.equals(type)) { - longs[columnIndex] = (long) prefilledValue; - } - else if (type instanceof DecimalType decimalType && decimalType.isShort()) { - longs[columnIndex] = (long) prefilledValue; - } - else if (type instanceof DecimalType decimalType && !decimalType.isShort()) { - objects[columnIndex] = prefilledValue; - } - else { - throw new TrinoException(NOT_SUPPORTED, format("Unsupported column type %s for prefilled column: %s", type.getDisplayName(), name)); - } - } - } - } - - @Override - protected RecordCursor delegate() - { - return delegate; - } - - @Override - public Type getType(int field) - { - return types[field]; - } - - @Override - public boolean getBoolean(int field) - { - ColumnMapping columnMapping = columnMappings.get(field); - if (columnMapping.getKind() == REGULAR) { - return delegate.getBoolean(columnMapping.getIndex()); - } - return booleans[field]; - } - - @Override - public long getLong(int field) - { - ColumnMapping columnMapping = columnMappings.get(field); - if (columnMapping.getKind() == REGULAR) { - return delegate.getLong(columnMapping.getIndex()); - } - return longs[field]; - } - - @Override - public double getDouble(int field) - { - ColumnMapping columnMapping = columnMappings.get(field); - if (columnMapping.getKind() == REGULAR) { - return delegate.getDouble(columnMapping.getIndex()); - } - return doubles[field]; - } - - @Override - public Slice getSlice(int field) - { - ColumnMapping columnMapping = columnMappings.get(field); - if (columnMapping.getKind() == REGULAR) { - return delegate.getSlice(columnMapping.getIndex()); - } - return slices[field]; - } - - @Override - public Object getObject(int field) - { - ColumnMapping columnMapping = columnMappings.get(field); - if (columnMapping.getKind() == REGULAR) { - return delegate.getObject(columnMapping.getIndex()); - } - return objects[field]; - } - - @Override - public boolean isNull(int field) - { - ColumnMapping columnMapping = columnMappings.get(field); - if (columnMapping.getKind() == REGULAR) { - return delegate.isNull(columnMapping.getIndex()); - } - return nulls[field]; - } - - @VisibleForTesting - RecordCursor getRegularColumnRecordCursor() - { - return delegate; - } -} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveRecordCursorProvider.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveRecordCursorProvider.java deleted file mode 100644 index aad75d2216101..0000000000000 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveRecordCursorProvider.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package io.trino.plugin.hive; - -import io.trino.filesystem.Location; -import io.trino.spi.connector.ConnectorSession; -import io.trino.spi.connector.RecordCursor; -import io.trino.spi.predicate.TupleDomain; -import io.trino.spi.type.TypeManager; -import org.apache.hadoop.conf.Configuration; - -import java.util.List; -import java.util.Optional; -import java.util.Properties; - -import static java.util.Objects.requireNonNull; - -public interface HiveRecordCursorProvider -{ - Optional createRecordCursor( - Configuration configuration, - ConnectorSession session, - Location path, - long start, - long length, - long fileSize, - Properties schema, - List columns, - TupleDomain effectivePredicate, - TypeManager typeManager, - boolean s3SelectPushdownEnabled); - - /** - * A wrapper class for - * - delegate reader record cursor and - * - projection information for columns to be returned by the delegate - *

- * Empty {@param projectedReaderColumns} indicates that the delegate cursor reads the exact same columns provided to - * it in {@link HiveRecordCursorProvider#createRecordCursor} - */ - class ReaderRecordCursorWithProjections - { - private final RecordCursor recordCursor; - private final Optional projectedReaderColumns; - - public ReaderRecordCursorWithProjections(RecordCursor recordCursor, Optional projectedReaderColumns) - { - this.recordCursor = requireNonNull(recordCursor, "recordCursor is null"); - this.projectedReaderColumns = requireNonNull(projectedReaderColumns, "projectedReaderColumns is null"); - } - - public RecordCursor getRecordCursor() - { - return recordCursor; - } - - public Optional getProjectedReaderColumns() - { - return projectedReaderColumns; - } - } -} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveSessionProperties.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveSessionProperties.java index 570d444b3d611..907388434b94b 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveSessionProperties.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveSessionProperties.java @@ -65,19 +65,6 @@ public final class HiveSessionProperties private static final String PARALLEL_PARTITIONED_BUCKETED_WRITES = "parallel_partitioned_bucketed_writes"; private static final String FORCE_LOCAL_SCHEDULING = "force_local_scheduling"; private static final String INSERT_EXISTING_PARTITIONS_BEHAVIOR = "insert_existing_partitions_behavior"; - private static final String AVRO_NATIVE_READER_ENABLED = "avro_native_reader_enabled"; - private static final String AVRO_NATIVE_WRITER_ENABLED = "avro_native_writer_enabled"; - private static final String CSV_NATIVE_READER_ENABLED = "csv_native_reader_enabled"; - private static final String CSV_NATIVE_WRITER_ENABLED = "csv_native_writer_enabled"; - private static final String JSON_NATIVE_READER_ENABLED = "json_native_reader_enabled"; - private static final String JSON_NATIVE_WRITER_ENABLED = "json_native_writer_enabled"; - private static final String OPENX_JSON_NATIVE_READER_ENABLED = "openx_json_native_reader_enabled"; - private static final String OPENX_JSON_NATIVE_WRITER_ENABLED = "openx_json_native_writer_enabled"; - private static final String REGEX_NATIVE_READER_ENABLED = "regex_native_reader_enabled"; - private static final String TEXT_FILE_NATIVE_READER_ENABLED = "text_file_native_reader_enabled"; - private static final String TEXT_FILE_NATIVE_WRITER_ENABLED = "text_file_native_writer_enabled"; - private static final String SEQUENCE_FILE_NATIVE_READER_ENABLED = "sequence_file_native_reader_enabled"; - private static final String SEQUENCE_FILE_NATIVE_WRITER_ENABLED = "sequence_file_native_writer_enabled"; private static final String ORC_BLOOM_FILTERS_ENABLED = "orc_bloom_filters_enabled"; private static final String ORC_MAX_MERGE_DISTANCE = "orc_max_merge_distance"; private static final String ORC_MAX_BUFFER_SIZE = "orc_max_buffer_size"; @@ -119,7 +106,6 @@ public final class HiveSessionProperties private static final String IGNORE_CORRUPTED_STATISTICS = "ignore_corrupted_statistics"; private static final String COLLECT_COLUMN_STATISTICS_ON_WRITE = "collect_column_statistics_on_write"; private static final String OPTIMIZE_MISMATCHED_BUCKET_COUNT = "optimize_mismatched_bucket_count"; - private static final String S3_SELECT_PUSHDOWN_ENABLED = "s3_select_pushdown_enabled"; private static final String DELEGATE_TRANSACTIONAL_MANAGED_TABLE_LOCATION_TO_METASTORE = 
"delegate_transactional_managed_table_location_to_metastore"; private static final String IGNORE_ABSENT_PARTITIONS = "ignore_absent_partitions"; private static final String QUERY_PARTITION_FILTER_REQUIRED = "query_partition_filter_required"; @@ -127,7 +113,6 @@ public final class HiveSessionProperties private static final String PROJECTION_PUSHDOWN_ENABLED = "projection_pushdown_enabled"; private static final String TIMESTAMP_PRECISION = "timestamp_precision"; private static final String DYNAMIC_FILTERING_WAIT_TIMEOUT = "dynamic_filtering_wait_timeout"; - private static final String OPTIMIZE_SYMLINK_LISTING = "optimize_symlink_listing"; private static final String HIVE_VIEWS_LEGACY_TRANSLATION = "hive_views_legacy_translation"; private static final String ICEBERG_CATALOG_NAME = "iceberg_catalog_name"; public static final String DELTA_LAKE_CATALOG_NAME = "delta_lake_catalog_name"; @@ -161,7 +146,6 @@ static boolean isValid(InsertExistingPartitionsBehavior value, boolean immutable @Inject public HiveSessionProperties( HiveConfig hiveConfig, - HiveFormatsConfig hiveFormatsConfig, OrcReaderConfig orcReaderConfig, OrcWriterConfig orcWriterConfig, ParquetReaderConfig parquetReaderConfig, @@ -202,71 +186,6 @@ public HiveSessionProperties( false, value -> InsertExistingPartitionsBehavior.valueOf((String) value, hiveConfig.isImmutablePartitions()), InsertExistingPartitionsBehavior::toString), - booleanProperty( - AVRO_NATIVE_READER_ENABLED, - "Use native Avro file reader", - hiveFormatsConfig.isAvroFileNativeReaderEnabled(), - false), - booleanProperty( - AVRO_NATIVE_WRITER_ENABLED, - "Use native Avro file writer", - hiveFormatsConfig.isAvroFileNativeWriterEnabled(), - false), - booleanProperty( - CSV_NATIVE_READER_ENABLED, - "Use native CSV reader", - hiveFormatsConfig.isCsvNativeReaderEnabled(), - false), - booleanProperty( - CSV_NATIVE_WRITER_ENABLED, - "Use native CSV writer", - hiveFormatsConfig.isCsvNativeWriterEnabled(), - false), - booleanProperty( - JSON_NATIVE_READER_ENABLED, - "Use native JSON reader", - hiveFormatsConfig.isJsonNativeReaderEnabled(), - false), - booleanProperty( - JSON_NATIVE_WRITER_ENABLED, - "Use native JSON writer", - hiveFormatsConfig.isJsonNativeWriterEnabled(), - false), - booleanProperty( - OPENX_JSON_NATIVE_READER_ENABLED, - "Use native OpenX JSON reader", - hiveFormatsConfig.isOpenXJsonNativeReaderEnabled(), - false), - booleanProperty( - OPENX_JSON_NATIVE_WRITER_ENABLED, - "Use native OpenX JSON writer", - hiveFormatsConfig.isOpenXJsonNativeWriterEnabled(), - false), - booleanProperty( - REGEX_NATIVE_READER_ENABLED, - "Use native REGEX reader", - hiveFormatsConfig.isRegexNativeReaderEnabled(), - false), - booleanProperty( - TEXT_FILE_NATIVE_READER_ENABLED, - "Use native text file reader", - hiveFormatsConfig.isTextFileNativeReaderEnabled(), - false), - booleanProperty( - TEXT_FILE_NATIVE_WRITER_ENABLED, - "Use native text file writer", - hiveFormatsConfig.isTextFileNativeWriterEnabled(), - false), - booleanProperty( - SEQUENCE_FILE_NATIVE_READER_ENABLED, - "Use native sequence file reader", - hiveFormatsConfig.isSequenceFileNativeReaderEnabled(), - false), - booleanProperty( - SEQUENCE_FILE_NATIVE_WRITER_ENABLED, - "Use native sequence file writer", - hiveFormatsConfig.isSequenceFileNativeWriterEnabled(), - false), booleanProperty( ORC_BLOOM_FILTERS_ENABLED, "ORC: Enable bloom filters for predicate pushdown", @@ -511,11 +430,6 @@ public HiveSessionProperties( "Experimental: Enable optimization to avoid shuffle when bucket count is compatible but not the same", 
hiveConfig.isOptimizeMismatchedBucketCount(), false), - booleanProperty( - S3_SELECT_PUSHDOWN_ENABLED, - "S3 Select pushdown enabled", - hiveConfig.isS3SelectPushdownEnabled(), - false), booleanProperty( DELEGATE_TRANSACTIONAL_MANAGED_TABLE_LOCATION_TO_METASTORE, "When transactional managed table is created via Trino the location will not be set in request sent to HMS and location will be determined by metastore; if this property is set to true CREATE TABLE AS queries are not supported.", @@ -564,11 +478,6 @@ public HiveSessionProperties( "Duration to wait for completion of dynamic filters during split generation", hiveConfig.getDynamicFilteringWaitTimeout(), false), - booleanProperty( - OPTIMIZE_SYMLINK_LISTING, - "Optimize listing for SymlinkTextFormat tables with files in a single directory", - hiveConfig.isOptimizeSymlinkListing(), - false), booleanProperty( HIVE_VIEWS_LEGACY_TRANSLATION, "Use legacy Hive view translation mechanism", @@ -653,71 +562,6 @@ public static InsertExistingPartitionsBehavior getInsertExistingPartitionsBehavi return session.getProperty(INSERT_EXISTING_PARTITIONS_BEHAVIOR, InsertExistingPartitionsBehavior.class); } - public static boolean isAvroNativeReaderEnabled(ConnectorSession session) - { - return session.getProperty(AVRO_NATIVE_READER_ENABLED, Boolean.class); - } - - public static boolean isAvroNativeWriterEnabled(ConnectorSession session) - { - return session.getProperty(AVRO_NATIVE_WRITER_ENABLED, Boolean.class); - } - - public static boolean isCsvNativeReaderEnabled(ConnectorSession session) - { - return session.getProperty(CSV_NATIVE_READER_ENABLED, Boolean.class); - } - - public static boolean isCsvNativeWriterEnabled(ConnectorSession session) - { - return session.getProperty(CSV_NATIVE_WRITER_ENABLED, Boolean.class); - } - - public static boolean isJsonNativeReaderEnabled(ConnectorSession session) - { - return session.getProperty(JSON_NATIVE_READER_ENABLED, Boolean.class); - } - - public static boolean isJsonNativeWriterEnabled(ConnectorSession session) - { - return session.getProperty(JSON_NATIVE_WRITER_ENABLED, Boolean.class); - } - - public static boolean isOpenXJsonNativeReaderEnabled(ConnectorSession session) - { - return session.getProperty(OPENX_JSON_NATIVE_READER_ENABLED, Boolean.class); - } - - public static boolean isOpenXJsonNativeWriterEnabled(ConnectorSession session) - { - return session.getProperty(OPENX_JSON_NATIVE_WRITER_ENABLED, Boolean.class); - } - - public static boolean isRegexNativeReaderEnabled(ConnectorSession session) - { - return session.getProperty(REGEX_NATIVE_READER_ENABLED, Boolean.class); - } - - public static boolean isTextFileNativeReaderEnabled(ConnectorSession session) - { - return session.getProperty(TEXT_FILE_NATIVE_READER_ENABLED, Boolean.class); - } - - public static boolean isTextFileNativeWriterEnabled(ConnectorSession session) - { - return session.getProperty(TEXT_FILE_NATIVE_WRITER_ENABLED, Boolean.class); - } - - public static boolean isSequenceFileNativeReaderEnabled(ConnectorSession session) - { - return session.getProperty(SEQUENCE_FILE_NATIVE_READER_ENABLED, Boolean.class); - } - - public static boolean isSequenceFileNativeWriterEnabled(ConnectorSession session) - { - return session.getProperty(SEQUENCE_FILE_NATIVE_WRITER_ENABLED, Boolean.class); - } - public static boolean isOrcBloomFiltersEnabled(ConnectorSession session) { return session.getProperty(ORC_BLOOM_FILTERS_ENABLED, Boolean.class); @@ -907,11 +751,6 @@ public static boolean isPropagateTableScanSortingProperties(ConnectorSession ses return 
session.getProperty(PROPAGATE_TABLE_SCAN_SORTING_PROPERTIES, Boolean.class); } - public static boolean isS3SelectPushdownEnabled(ConnectorSession session) - { - return session.getProperty(S3_SELECT_PUSHDOWN_ENABLED, Boolean.class); - } - public static boolean isStatisticsEnabled(ConnectorSession session) { return session.getProperty(STATISTICS_ENABLED, Boolean.class); @@ -979,11 +818,6 @@ public static Duration getDynamicFilteringWaitTimeout(ConnectorSession session) return session.getProperty(DYNAMIC_FILTERING_WAIT_TIMEOUT, Duration.class); } - public static boolean isOptimizeSymlinkListing(ConnectorSession session) - { - return session.getProperty(OPTIMIZE_SYMLINK_LISTING, Boolean.class); - } - public static boolean isHiveViewsLegacyTranslation(ConnectorSession session) { return session.getProperty(HIVE_VIEWS_LEGACY_TRANSLATION, Boolean.class); diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveSplit.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveSplit.java index 1167039edf67a..62df401b71aa6 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveSplit.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveSplit.java @@ -55,7 +55,6 @@ public class HiveSplit private final TableToPartitionMapping tableToPartitionMapping; private final Optional bucketConversion; private final Optional bucketValidation; - private final boolean s3SelectPushdownEnabled; private final Optional acidInfo; private final SplitWeight splitWeight; @@ -76,7 +75,6 @@ public HiveSplit( @JsonProperty("tableToPartitionMapping") TableToPartitionMapping tableToPartitionMapping, @JsonProperty("bucketConversion") Optional bucketConversion, @JsonProperty("bucketValidation") Optional bucketValidation, - @JsonProperty("s3SelectPushdownEnabled") boolean s3SelectPushdownEnabled, @JsonProperty("acidInfo") Optional acidInfo, @JsonProperty("splitWeight") SplitWeight splitWeight) { @@ -110,7 +108,6 @@ public HiveSplit( this.tableToPartitionMapping = tableToPartitionMapping; this.bucketConversion = bucketConversion; this.bucketValidation = bucketValidation; - this.s3SelectPushdownEnabled = s3SelectPushdownEnabled; this.acidInfo = acidInfo; this.splitWeight = requireNonNull(splitWeight, "splitWeight is null"); } @@ -212,12 +209,6 @@ public boolean isRemotelyAccessible() return !forceLocalScheduling; } - @JsonProperty - public boolean isS3SelectPushdownEnabled() - { - return s3SelectPushdownEnabled; - } - @JsonProperty public Optional getAcidInfo() { @@ -261,7 +252,6 @@ public Object getInfo() .put("forceLocalScheduling", forceLocalScheduling) .put("partitionName", partitionName) .put("deserializerClassName", getDeserializerClassName(schema)) - .put("s3SelectPushdownEnabled", s3SelectPushdownEnabled) .buildOrThrow(); } diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveSplitManager.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveSplitManager.java index 077e5a52540d5..6d740932793aa 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveSplitManager.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveSplitManager.java @@ -23,7 +23,6 @@ import io.airlift.stats.CounterStat; import io.airlift.units.DataSize; import io.trino.filesystem.TrinoFileSystemFactory; -import io.trino.hdfs.HdfsEnvironment; import io.trino.hdfs.HdfsNamenodeStats; import io.trino.plugin.hive.metastore.Column; import io.trino.plugin.hive.metastore.Partition; @@ -75,7 +74,6 @@ import static 
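The session-property getters removed above all follow the same connector pattern: declare the property once with `PropertyMetadata.booleanProperty(name, description, defaultValue, hidden)` and read it back with `session.getProperty(name, Boolean.class)`. A minimal sketch of that pattern against the Trino SPI; the property name and config default here are invented for illustration.

```java
import io.trino.spi.connector.ConnectorSession;
import io.trino.spi.session.PropertyMetadata;

import java.util.List;

import static io.trino.spi.session.PropertyMetadata.booleanProperty;

public final class ExampleSessionProperties
{
    // Hypothetical property name, not an actual Hive connector property
    private static final String EXAMPLE_FEATURE_ENABLED = "example_feature_enabled";

    private final List<PropertyMetadata<?>> sessionProperties;

    public ExampleSessionProperties(boolean defaultFromConfig)
    {
        // The default typically comes from the connector config class
        this.sessionProperties = List.<PropertyMetadata<?>>of(
                booleanProperty(
                        EXAMPLE_FEATURE_ENABLED,
                        "Enable the example feature",
                        defaultFromConfig,
                        false));
    }

    public List<PropertyMetadata<?>> getSessionProperties()
    {
        return sessionProperties;
    }

    public static boolean isExampleFeatureEnabled(ConnectorSession session)
    {
        return session.getProperty(EXAMPLE_FEATURE_ENABLED, Boolean.class);
    }
}
```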
io.trino.plugin.hive.HiveSessionProperties.getDynamicFilteringWaitTimeout; import static io.trino.plugin.hive.HiveSessionProperties.getTimestampPrecision; import static io.trino.plugin.hive.HiveSessionProperties.isIgnoreAbsentPartitions; -import static io.trino.plugin.hive.HiveSessionProperties.isOptimizeSymlinkListing; import static io.trino.plugin.hive.HiveSessionProperties.isPropagateTableScanSortingProperties; import static io.trino.plugin.hive.HiveSessionProperties.isUseOrcColumnNames; import static io.trino.plugin.hive.HiveSessionProperties.isUseParquetColumnNames; @@ -105,7 +103,6 @@ public class HiveSplitManager private final HivePartitionManager partitionManager; private final TrinoFileSystemFactory fileSystemFactory; private final HdfsNamenodeStats hdfsNamenodeStats; - private final HdfsEnvironment hdfsEnvironment; private final Executor executor; private final int maxOutstandingSplits; private final DataSize maxOutstandingSplitsSize; @@ -126,7 +123,6 @@ public HiveSplitManager( HivePartitionManager partitionManager, TrinoFileSystemFactory fileSystemFactory, HdfsNamenodeStats hdfsNamenodeStats, - HdfsEnvironment hdfsEnvironment, ExecutorService executorService, VersionEmbedder versionEmbedder, TypeManager typeManager) @@ -136,7 +132,6 @@ public HiveSplitManager( partitionManager, fileSystemFactory, hdfsNamenodeStats, - hdfsEnvironment, versionEmbedder.embedVersion(new BoundedExecutor(executorService, hiveConfig.getMaxSplitIteratorThreads())), new CounterStat(), hiveConfig.getMaxOutstandingSplits(), @@ -156,7 +151,6 @@ public HiveSplitManager( HivePartitionManager partitionManager, TrinoFileSystemFactory fileSystemFactory, HdfsNamenodeStats hdfsNamenodeStats, - HdfsEnvironment hdfsEnvironment, Executor executor, CounterStat highMemorySplitSourceCounter, int maxOutstandingSplits, @@ -174,7 +168,6 @@ public HiveSplitManager( this.partitionManager = requireNonNull(partitionManager, "partitionManager is null"); this.fileSystemFactory = requireNonNull(fileSystemFactory, "fileSystemFactory is null"); this.hdfsNamenodeStats = requireNonNull(hdfsNamenodeStats, "hdfsNamenodeStats is null"); - this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); this.executor = new ErrorCodedExecutor(executor); this.highMemorySplitSourceCounter = requireNonNull(highMemorySplitSourceCounter, "highMemorySplitSourceCounter is null"); checkArgument(maxOutstandingSplits >= 1, "maxOutstandingSplits must be at least 1"); @@ -259,14 +252,12 @@ public ConnectorSplitSource getSplits( createBucketSplitInfo(bucketHandle, bucketFilter), session, fileSystemFactory, - hdfsEnvironment, hdfsNamenodeStats, transactionalMetadata.getDirectoryLister(), executor, splitLoaderConcurrency, recursiveDfsWalkerEnabled, !hiveTable.getPartitionColumns().isEmpty() && isIgnoreAbsentPartitions(session), - isOptimizeSymlinkListing(session), metastore.getValidWriteIds(session, hiveTable) .map(value -> value.getTableValidWriteIdList(table.getDatabaseName() + "." 
+ table.getTableName())), hiveTable.getMaxScannedFileSize(), diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveSplitSource.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveSplitSource.java index 1c8affff2a004..63f36aec02698 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveSplitSource.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveSplitSource.java @@ -312,7 +312,6 @@ else if (maxSplitBytes * 2 >= remainingBlockBytes) { internalSplit.getTableToPartitionMapping(), internalSplit.getBucketConversion(), internalSplit.getBucketValidation(), - internalSplit.isS3SelectPushdownEnabled(), internalSplit.getAcidInfo(), splitWeightProvider.weightForSplitSizeInBytes(splitBytes))); diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveStorageFormat.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveStorageFormat.java index 203212afdd427..8d6b3d93e4912 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveStorageFormat.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveStorageFormat.java @@ -14,8 +14,6 @@ package io.trino.plugin.hive; import com.google.common.collect.ImmutableMap; -import io.airlift.units.DataSize; -import io.airlift.units.DataSize.Unit; import io.trino.hive.formats.compression.CompressionKind; import io.trino.plugin.hive.metastore.StorageFormat; import io.trino.plugin.hive.type.Category; @@ -63,70 +61,57 @@ public enum HiveStorageFormat ORC( ORC_SERDE_CLASS, ORC_INPUT_FORMAT_CLASS, - ORC_OUTPUT_FORMAT_CLASS, - DataSize.of(64, Unit.MEGABYTE)), + ORC_OUTPUT_FORMAT_CLASS), PARQUET( PARQUET_HIVE_SERDE_CLASS, MAPRED_PARQUET_INPUT_FORMAT_CLASS, - MAPRED_PARQUET_OUTPUT_FORMAT_CLASS, - DataSize.of(64, Unit.MEGABYTE)), + MAPRED_PARQUET_OUTPUT_FORMAT_CLASS), AVRO( AVRO_SERDE_CLASS, AVRO_CONTAINER_INPUT_FORMAT_CLASS, - AVRO_CONTAINER_OUTPUT_FORMAT_CLASS, - DataSize.of(64, Unit.MEGABYTE)), + AVRO_CONTAINER_OUTPUT_FORMAT_CLASS), RCBINARY( LAZY_BINARY_COLUMNAR_SERDE_CLASS, RCFILE_INPUT_FORMAT_CLASS, - RCFILE_OUTPUT_FORMAT_CLASS, - DataSize.of(8, Unit.MEGABYTE)), + RCFILE_OUTPUT_FORMAT_CLASS), RCTEXT( COLUMNAR_SERDE_CLASS, RCFILE_INPUT_FORMAT_CLASS, - RCFILE_OUTPUT_FORMAT_CLASS, - DataSize.of(8, Unit.MEGABYTE)), + RCFILE_OUTPUT_FORMAT_CLASS), SEQUENCEFILE( LAZY_SIMPLE_SERDE_CLASS, SEQUENCEFILE_INPUT_FORMAT_CLASS, - HIVE_SEQUENCEFILE_OUTPUT_FORMAT_CLASS, - DataSize.of(8, Unit.MEGABYTE)), + HIVE_SEQUENCEFILE_OUTPUT_FORMAT_CLASS), JSON( JSON_SERDE_CLASS, TEXT_INPUT_FORMAT_CLASS, - HIVE_IGNORE_KEY_OUTPUT_FORMAT_CLASS, - DataSize.of(8, Unit.MEGABYTE)), + HIVE_IGNORE_KEY_OUTPUT_FORMAT_CLASS), OPENX_JSON( OPENX_JSON_SERDE_CLASS, TEXT_INPUT_FORMAT_CLASS, - HIVE_IGNORE_KEY_OUTPUT_FORMAT_CLASS, - DataSize.of(8, Unit.MEGABYTE)), + HIVE_IGNORE_KEY_OUTPUT_FORMAT_CLASS), TEXTFILE( LAZY_SIMPLE_SERDE_CLASS, TEXT_INPUT_FORMAT_CLASS, - HIVE_IGNORE_KEY_OUTPUT_FORMAT_CLASS, - DataSize.of(8, Unit.MEGABYTE)), + HIVE_IGNORE_KEY_OUTPUT_FORMAT_CLASS), CSV( OPENCSV_SERDE_CLASS, TEXT_INPUT_FORMAT_CLASS, - HIVE_IGNORE_KEY_OUTPUT_FORMAT_CLASS, - DataSize.of(8, Unit.MEGABYTE)), + HIVE_IGNORE_KEY_OUTPUT_FORMAT_CLASS), REGEX( REGEX_SERDE_CLASS, TEXT_INPUT_FORMAT_CLASS, - HIVE_IGNORE_KEY_OUTPUT_FORMAT_CLASS, - DataSize.of(8, Unit.MEGABYTE)); + HIVE_IGNORE_KEY_OUTPUT_FORMAT_CLASS); private final String serde; private final String inputFormat; private final String outputFormat; - private final DataSize estimatedWriterMemoryUsage; - HiveStorageFormat(String serde, String inputFormat, String 
outputFormat, DataSize estimatedWriterMemoryUsage) + HiveStorageFormat(String serde, String inputFormat, String outputFormat) { this.serde = requireNonNull(serde, "serde is null"); this.inputFormat = requireNonNull(inputFormat, "inputFormat is null"); this.outputFormat = requireNonNull(outputFormat, "outputFormat is null"); - this.estimatedWriterMemoryUsage = requireNonNull(estimatedWriterMemoryUsage, "estimatedWriterMemoryUsage is null"); } public String getSerde() @@ -144,11 +129,6 @@ public String getOutputFormat() return outputFormat; } - public DataSize getEstimatedWriterMemoryUsage() - { - return estimatedWriterMemoryUsage; - } - public boolean isSplittable(String path) { // Only uncompressed text input format is splittable diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveWriterFactory.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveWriterFactory.java index 0402d29933f49..4c07825cddca5 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveWriterFactory.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveWriterFactory.java @@ -24,8 +24,7 @@ import io.trino.filesystem.Location; import io.trino.filesystem.TrinoFileSystem; import io.trino.filesystem.TrinoFileSystemFactory; -import io.trino.hdfs.HdfsContext; -import io.trino.hdfs.HdfsEnvironment; +import io.trino.hive.formats.compression.CompressionKind; import io.trino.plugin.hive.HiveSessionProperties.InsertExistingPartitionsBehavior; import io.trino.plugin.hive.LocationService.WriteInfo; import io.trino.plugin.hive.PartitionUpdate.UpdateMode; @@ -47,14 +46,6 @@ import io.trino.spi.type.RowType; import io.trino.spi.type.Type; import io.trino.spi.type.TypeManager; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.io.compress.CompressionCodec; -import org.apache.hadoop.io.compress.DefaultCodec; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.util.ReflectionUtils; -import org.joda.time.DateTimeZone; import java.io.IOException; import java.security.Principal; @@ -78,7 +69,6 @@ import static com.google.common.collect.ImmutableMap.toImmutableMap; import static com.google.common.collect.Maps.immutableEntry; import static com.google.common.collect.MoreCollectors.onlyElement; -import static io.trino.hdfs.ConfigurationUtils.toJobConf; import static io.trino.plugin.hive.HiveCompressionCodecs.selectCompressionCodec; import static io.trino.plugin.hive.HiveErrorCode.HIVE_FILESYSTEM_ERROR; import static io.trino.plugin.hive.HiveErrorCode.HIVE_INVALID_METADATA; @@ -97,7 +87,6 @@ import static io.trino.plugin.hive.util.AcidTables.deltaSubdir; import static io.trino.plugin.hive.util.AcidTables.isFullAcidTable; import static io.trino.plugin.hive.util.AcidTables.isInsertOnlyTable; -import static io.trino.plugin.hive.util.CompressionConfigUtil.configureCompression; import static io.trino.plugin.hive.util.HiveClassNames.HIVE_IGNORE_KEY_OUTPUT_FORMAT_CLASS; import static io.trino.plugin.hive.util.HiveUtil.getColumnNames; import static io.trino.plugin.hive.util.HiveUtil.getColumnTypes; @@ -116,7 +105,6 @@ import static java.util.stream.Collectors.joining; import static java.util.stream.Collectors.toList; import static java.util.stream.Collectors.toMap; -import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.COMPRESSRESULT; public class HiveWriterFactory { @@ -147,7 +135,6 @@ public class HiveWriterFactory private final HivePageSinkMetadataProvider 
pageSinkMetadataProvider; private final TypeManager typeManager; private final PageSorter pageSorter; - private final JobConf conf; private final Table table; private final DataSize sortBufferSize; @@ -155,7 +142,6 @@ public class HiveWriterFactory private final boolean sortedWritingTempStagingPathEnabled; private final String sortedWritingTempStagingPath; private final InsertExistingPartitionsBehavior insertExistingPartitionsBehavior; - private final DateTimeZone parquetTimeZone; private final ConnectorSession session; private final OptionalInt bucketCount; @@ -187,11 +173,9 @@ public HiveWriterFactory( String queryId, HivePageSinkMetadataProvider pageSinkMetadataProvider, TypeManager typeManager, - HdfsEnvironment hdfsEnvironment, PageSorter pageSorter, DataSize sortBufferSize, int maxOpenSortFiles, - DateTimeZone parquetTimeZone, ConnectorSession session, NodeManager nodeManager, EventClient eventClient, @@ -223,7 +207,6 @@ public HiveWriterFactory( this.sortedWritingTempStagingPathEnabled = sortedWritingTempStagingPathEnabled; this.sortedWritingTempStagingPath = requireNonNull(sortedWritingTempStagingPath, "sortedWritingTempStagingPath is null"); this.insertExistingPartitionsBehavior = getInsertExistingPartitionsBehavior(session); - this.parquetTimeZone = requireNonNull(parquetTimeZone, "parquetTimeZone is null"); // divide input columns into partition and data columns ImmutableList.Builder partitionColumnNames = ImmutableList.builder(); @@ -256,17 +239,14 @@ public HiveWriterFactory( this.dataColumns = dataColumns.build(); this.isCreateTransactionalTable = isCreateTable && transaction.isTransactional(); - Location writePath; if (isCreateTable) { this.table = null; WriteInfo writeInfo = locationService.getQueryWriteInfo(locationHandle); checkArgument(writeInfo.writeMode() != DIRECT_TO_TARGET_EXISTING_DIRECTORY, "CREATE TABLE write mode cannot be DIRECT_TO_TARGET_EXISTING_DIRECTORY"); - writePath = writeInfo.writePath(); } else { this.table = pageSinkMetadataProvider.getTable() .orElseThrow(() -> new TrinoException(HIVE_INVALID_METADATA, format("Table '%s.%s' was dropped during insert", schemaName, tableName))); - writePath = locationService.getQueryWriteInfo(locationHandle).writePath(); } this.bucketCount = requireNonNull(bucketCount, "bucketCount is null"); @@ -289,17 +269,6 @@ public HiveWriterFactory( .filter(entry -> entry.getValue() != null) .collect(toImmutableMap(Entry::getKey, entry -> entry.getValue().toString())); - Configuration conf = hdfsEnvironment.getConfiguration(new HdfsContext(session), new Path(writePath.toString())); - this.conf = toJobConf(conf); - - // make sure the FileSystem is created with the correct Configuration object - try { - hdfsEnvironment.getFileSystem(session.getIdentity(), new Path(writePath.toString()), conf); - } - catch (IOException e) { - throw new TrinoException(HIVE_FILESYSTEM_ERROR, "Failed getting FileSystem: " + writePath, e); - } - this.hiveWriterStats = requireNonNull(hiveWriterStats, "hiveWriterStats is null"); } @@ -473,9 +442,6 @@ public HiveWriter createWriter(Page partitionColumns, int position, OptionalInt } } - JobConf outputConf = new JobConf(conf); - configureCompression(outputConf, compressionCodec); - additionalTableParameters.forEach(schema::setProperty); validateSchema(partitionName, schema); @@ -489,7 +455,7 @@ public HiveWriter createWriter(Page partitionColumns, int position, OptionalInt path = path.appendPath(subdir).appendPath(nameFormat.formatted(bucketToUse)); } else { - path = 
path.appendPath(computeFileName(bucketNumber) + getFileExtension(outputConf, outputStorageFormat));
+            path = path.appendPath(computeFileName(bucketNumber) + getFileExtension(compressionCodec, outputStorageFormat));
         }
 
         boolean useAcidSchema = isCreateTransactionalTable || (table != null && isFullAcidTable(table.getParameters()));
@@ -538,18 +504,7 @@ public HiveWriter createWriter(Page partitionColumns, int position, OptionalInt
         }
 
         if (hiveFileWriter == null) {
-            hiveFileWriter = new RecordFileWriter(
-                    new Path(path.toString()),
-                    dataColumns.stream()
-                            .map(DataColumn::getName)
-                            .collect(toList()),
-                    outputStorageFormat,
-                    schema,
-                    partitionStorageFormat.getEstimatedWriterMemoryUsage(),
-                    outputConf,
-                    typeManager,
-                    parquetTimeZone,
-                    session);
+            throw new TrinoException(HIVE_UNSUPPORTED_FORMAT, "Writing not supported for " + outputStorageFormat);
         }
 
         String writePath = path.toString();
@@ -782,28 +737,13 @@ public static int getBucketFromFileName(String fileName)
         return Integer.parseInt(matcher.group(1));
     }
 
-    public static String getFileExtension(JobConf conf, StorageFormat storageFormat)
+    public static String getFileExtension(HiveCompressionCodec compression, StorageFormat format)
     {
         // text format files must have the correct extension when compressed
-        if (!HiveConf.getBoolVar(conf, COMPRESSRESULT) || !HIVE_IGNORE_KEY_OUTPUT_FORMAT_CLASS.equals(storageFormat.getOutputFormat())) {
-            return "";
-        }
-
-        String compressionCodecClass = conf.get("mapred.output.compression.codec");
-        if (compressionCodecClass == null) {
-            return new DefaultCodec().getDefaultExtension();
-        }
-
-        try {
-            Class<? extends CompressionCodec> codecClass = conf.getClassByName(compressionCodecClass).asSubclass(CompressionCodec.class);
-            return ReflectionUtils.newInstance(codecClass, conf).getDefaultExtension();
-        }
-        catch (ClassNotFoundException e) {
-            throw new TrinoException(HIVE_UNSUPPORTED_FORMAT, "Compression codec not found: " + compressionCodecClass, e);
-        }
-        catch (RuntimeException e) {
-            throw new TrinoException(HIVE_UNSUPPORTED_FORMAT, "Failed to load compression codec: " + compressionCodecClass, e);
-        }
+        return compression.getHiveCompressionKind()
+                .filter(ignored -> format.getOutputFormat().equals(HIVE_IGNORE_KEY_OUTPUT_FORMAT_CLASS))
+                .map(CompressionKind::getFileExtension)
+                .orElse("");
     }
 
     @VisibleForTesting
diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/InternalHiveSplit.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/InternalHiveSplit.java
index 40c7e7a52b6e8..ae6c28e0c63bd 100644
--- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/InternalHiveSplit.java
+++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/InternalHiveSplit.java
@@ -54,7 +54,6 @@ public class InternalHiveSplit
     private final TableToPartitionMapping tableToPartitionMapping;
     private final Optional<BucketConversion> bucketConversion;
     private final Optional<BucketValidation> bucketValidation;
-    private final boolean s3SelectPushdownEnabled;
     private final Optional<AcidInfo> acidInfo;
 
     private final BooleanSupplier partitionMatchSupplier;
@@ -78,7 +77,6 @@ public InternalHiveSplit(
             TableToPartitionMapping tableToPartitionMapping,
             Optional<BucketConversion> bucketConversion,
             Optional<BucketValidation> bucketValidation,
-            boolean s3SelectPushdownEnabled,
             Optional<AcidInfo> acidInfo,
             BooleanSupplier partitionMatchSupplier)
     {
@@ -114,7 +112,6 @@ public InternalHiveSplit(
         this.tableToPartitionMapping = tableToPartitionMapping;
         this.bucketConversion = bucketConversion;
         this.bucketValidation = bucketValidation;
-        this.s3SelectPushdownEnabled = s3SelectPushdownEnabled;
         this.acidInfo = acidInfo;
         this.partitionMatchSupplier
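The rewritten `getFileExtension` in the `HiveWriterFactory` hunk above keeps the old behavior in a compact form: an extension is appended only when the output format is the Hive text format and a compression kind is actually in use, since columnar formats such as ORC and Parquet handle compression internally. A self-contained sketch of that decision, using a hypothetical `Compression` enum in place of `HiveCompressionCodec` and `CompressionKind`:

```java
import java.util.Optional;

public class FileExtensionSketch
{
    // Hypothetical stand-in for HiveCompressionCodec / CompressionKind
    enum Compression
    {
        NONE(Optional.empty()),
        GZIP(Optional.of(".gz")),
        ZSTD(Optional.of(".zst"));

        private final Optional<String> fileExtension;

        Compression(Optional<String> fileExtension)
        {
            this.fileExtension = fileExtension;
        }

        Optional<String> fileExtension()
        {
            return fileExtension;
        }
    }

    private static final String TEXT_OUTPUT_FORMAT = "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat";

    // Only text-format files get a compression-specific extension; every other case returns ""
    static String fileExtension(Compression compression, String outputFormat)
    {
        return compression.fileExtension()
                .filter(ignored -> TEXT_OUTPUT_FORMAT.equals(outputFormat))
                .orElse("");
    }

    public static void main(String[] args)
    {
        System.out.println(fileExtension(Compression.GZIP, TEXT_OUTPUT_FORMAT)); // .gz
        System.out.println(fileExtension(Compression.GZIP, "OrcOutputFormat"));  // (empty)
        System.out.println(fileExtension(Compression.NONE, TEXT_OUTPUT_FORMAT)); // (empty)
    }
}
```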
= partitionMatchSupplier; } @@ -144,11 +141,6 @@ public long getFileModifiedTime() return fileModifiedTime; } - public boolean isS3SelectPushdownEnabled() - { - return s3SelectPushdownEnabled; - } - public Properties getSchema() { return schema; diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/RecordFileWriter.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/RecordFileWriter.java deleted file mode 100644 index 7874b0326b377..0000000000000 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/RecordFileWriter.java +++ /dev/null @@ -1,244 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.trino.plugin.hive; - -import io.airlift.units.DataSize; -import io.trino.plugin.hive.metastore.StorageFormat; -import io.trino.plugin.hive.parquet.ParquetRecordWriter; -import io.trino.plugin.hive.util.FieldSetterFactory; -import io.trino.plugin.hive.util.FieldSetterFactory.FieldSetter; -import io.trino.plugin.hive.util.TextHeaderWriter; -import io.trino.spi.Page; -import io.trino.spi.TrinoException; -import io.trino.spi.block.Block; -import io.trino.spi.connector.ConnectorSession; -import io.trino.spi.type.Type; -import io.trino.spi.type.TypeManager; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter; -import org.apache.hadoop.hive.serde2.SerDeException; -import org.apache.hadoop.hive.serde2.Serializer; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructField; -import org.apache.hadoop.mapred.JobConf; -import org.joda.time.DateTimeZone; - -import java.io.Closeable; -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.List; -import java.util.Optional; -import java.util.Properties; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.collect.ImmutableList.toImmutableList; -import static io.airlift.slice.SizeOf.instanceSize; -import static io.trino.plugin.hive.HiveErrorCode.HIVE_WRITER_CLOSE_ERROR; -import static io.trino.plugin.hive.HiveErrorCode.HIVE_WRITER_DATA_ERROR; -import static io.trino.plugin.hive.HiveSessionProperties.getTimestampPrecision; -import static io.trino.plugin.hive.util.HiveClassNames.HIVE_IGNORE_KEY_OUTPUT_FORMAT_CLASS; -import static io.trino.plugin.hive.util.HiveUtil.getColumnNames; -import static io.trino.plugin.hive.util.HiveUtil.getColumnTypes; -import static io.trino.plugin.hive.util.HiveWriteUtils.createRecordWriter; -import static io.trino.plugin.hive.util.HiveWriteUtils.getRowColumnInspectors; -import static io.trino.plugin.hive.util.HiveWriteUtils.initializeSerializer; -import static java.util.Objects.requireNonNull; -import static java.util.stream.Collectors.toList; -import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardStructObjectInspector; - -public class RecordFileWriter - 
implements FileWriter -{ - private static final int INSTANCE_SIZE = instanceSize(RecordFileWriter.class); - - private final Path path; - private final JobConf conf; - private final int fieldCount; - private final Serializer serializer; - private final RecordWriter recordWriter; - private final SettableStructObjectInspector tableInspector; - private final List structFields; - private final Object row; - private final FieldSetter[] setters; - private final long estimatedWriterMemoryUsage; - - private boolean committed; - private long finalWrittenBytes = -1; - - public RecordFileWriter( - Path path, - List inputColumnNames, - StorageFormat storageFormat, - Properties schema, - DataSize estimatedWriterMemoryUsage, - JobConf conf, - TypeManager typeManager, - DateTimeZone parquetTimeZone, - ConnectorSession session) - { - this.path = requireNonNull(path, "path is null"); - this.conf = requireNonNull(conf, "conf is null"); - - // existing tables may have columns in a different order - List fileColumnNames = getColumnNames(schema); - List fileColumnTypes = getColumnTypes(schema).stream() - .map(hiveType -> hiveType.getType(typeManager, getTimestampPrecision(session))) - .collect(toList()); - - fieldCount = fileColumnNames.size(); - - String serde = storageFormat.getSerde(); - serializer = initializeSerializer(conf, schema, serde); - - List objectInspectors = getRowColumnInspectors(fileColumnTypes); - tableInspector = getStandardStructObjectInspector(fileColumnNames, objectInspectors); - - if (storageFormat.getOutputFormat().equals(HIVE_IGNORE_KEY_OUTPUT_FORMAT_CLASS)) { - Optional textHeaderWriter = Optional.of(new TextHeaderWriter(serializer, typeManager, session, fileColumnNames)); - recordWriter = createRecordWriter(path, conf, schema, storageFormat.getOutputFormat(), session, textHeaderWriter); - } - else { - recordWriter = createRecordWriter(path, conf, schema, storageFormat.getOutputFormat(), session, Optional.empty()); - } - - // reorder (and possibly reduce) struct fields to match input - structFields = inputColumnNames.stream() - .map(tableInspector::getStructFieldRef) - .collect(toImmutableList()); - - row = tableInspector.create(); - - DateTimeZone timeZone = (recordWriter instanceof ParquetRecordWriter) ? 
parquetTimeZone : DateTimeZone.UTC; - FieldSetterFactory fieldSetterFactory = new FieldSetterFactory(timeZone); - - setters = new FieldSetter[structFields.size()]; - for (int i = 0; i < setters.length; i++) { - setters[i] = fieldSetterFactory.create(tableInspector, row, structFields.get(i), fileColumnTypes.get(structFields.get(i).getFieldID())); - } - - this.estimatedWriterMemoryUsage = estimatedWriterMemoryUsage.toBytes(); - } - - @Override - public long getWrittenBytes() - { - if (recordWriter instanceof ExtendedRecordWriter) { - return ((ExtendedRecordWriter) recordWriter).getWrittenBytes(); - } - - if (committed) { - if (finalWrittenBytes != -1) { - return finalWrittenBytes; - } - - try { - finalWrittenBytes = path.getFileSystem(conf).getFileStatus(path).getLen(); - return finalWrittenBytes; - } - catch (IOException e) { - throw new UncheckedIOException(e); - } - } - - // there is no good way to get this when RecordWriter is not yet committed - return 0; - } - - @Override - public long getMemoryUsage() - { - return INSTANCE_SIZE + estimatedWriterMemoryUsage; - } - - @Override - public void appendRows(Page dataPage) - { - for (int position = 0; position < dataPage.getPositionCount(); position++) { - appendRow(dataPage, position); - } - } - - public void appendRow(Page dataPage, int position) - { - for (int field = 0; field < fieldCount; field++) { - Block block = dataPage.getBlock(field); - if (block.isNull(position)) { - tableInspector.setStructFieldData(row, structFields.get(field), null); - } - else { - setters[field].setField(block, position); - } - } - - try { - recordWriter.write(serializer.serialize(row, tableInspector)); - } - catch (SerDeException | IOException e) { - throw new TrinoException(HIVE_WRITER_DATA_ERROR, e); - } - } - - @Override - public Closeable commit() - { - try { - recordWriter.close(false); - committed = true; - } - catch (IOException e) { - throw new TrinoException(HIVE_WRITER_CLOSE_ERROR, "Error committing write to Hive", e); - } - - return createRollbackAction(path, conf); - } - - @Override - public void rollback() - { - Closeable rollbackAction = createRollbackAction(path, conf); - try (rollbackAction) { - recordWriter.close(true); - } - catch (IOException e) { - throw new TrinoException(HIVE_WRITER_CLOSE_ERROR, "Error rolling back write to Hive", e); - } - } - - private static Closeable createRollbackAction(Path path, JobConf conf) - { - return () -> path.getFileSystem(conf).delete(path, false); - } - - @Override - public long getValidationCpuNanos() - { - // RecordFileWriter delegates to Hive RecordWriter and there is no validation - return 0; - } - - @Override - public String toString() - { - return toStringHelper(this) - .add("path", path) - .toString(); - } - - public interface ExtendedRecordWriter - extends RecordWriter - { - long getWrittenBytes(); - } -} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/avro/AvroFileWriterFactory.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/avro/AvroFileWriterFactory.java index 8b3d420862fe8..9df3eb8aa98c0 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/avro/AvroFileWriterFactory.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/avro/AvroFileWriterFactory.java @@ -49,7 +49,6 @@ import static io.trino.plugin.hive.HiveMetadata.PRESTO_QUERY_ID_NAME; import static io.trino.plugin.hive.HiveMetadata.PRESTO_VERSION_NAME; import static io.trino.plugin.hive.HiveSessionProperties.getTimestampPrecision; -import static 
io.trino.plugin.hive.HiveSessionProperties.isAvroNativeWriterEnabled; import static io.trino.plugin.hive.util.HiveClassNames.AVRO_CONTAINER_OUTPUT_FORMAT_CLASS; import static io.trino.plugin.hive.util.HiveUtil.getColumnNames; import static io.trino.plugin.hive.util.HiveUtil.getColumnTypes; @@ -86,9 +85,6 @@ public Optional createFileWriter( boolean useAcidSchema, WriterKind writerKind) { - if (!isAvroNativeWriterEnabled(session)) { - return Optional.empty(); - } if (!AVRO_CONTAINER_OUTPUT_FORMAT_CLASS.equals(storageFormat.getOutputFormat())) { return Optional.empty(); } diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/avro/AvroPageSourceFactory.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/avro/AvroPageSourceFactory.java index 78e6acf03bac3..2aaa969e9774c 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/avro/AvroPageSourceFactory.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/avro/AvroPageSourceFactory.java @@ -56,7 +56,6 @@ import static io.trino.plugin.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT; import static io.trino.plugin.hive.HivePageSourceProvider.projectBaseColumns; import static io.trino.plugin.hive.HiveSessionProperties.getTimestampPrecision; -import static io.trino.plugin.hive.HiveSessionProperties.isAvroNativeReaderEnabled; import static io.trino.plugin.hive.ReaderPageSource.noProjectionAdaptation; import static io.trino.plugin.hive.avro.AvroHiveFileUtils.wrapInUnionWithNull; import static io.trino.plugin.hive.util.HiveClassNames.AVRO_SERDE_CLASS; @@ -93,10 +92,7 @@ public Optional createPageSource( boolean originalFile, AcidTransaction transaction) { - if (!isAvroNativeReaderEnabled(session)) { - return Optional.empty(); - } - else if (!AVRO_SERDE_CLASS.equals(getDeserializerClassName(schema))) { + if (!AVRO_SERDE_CLASS.equals(getDeserializerClassName(schema))) { return Optional.empty(); } checkArgument(acidInfo.isEmpty(), "Acid is not supported"); diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/avro/AvroRecordWriter.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/avro/AvroRecordWriter.java deleted file mode 100644 index e62be19b2d9e0..0000000000000 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/avro/AvroRecordWriter.java +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package io.trino.plugin.hive.avro; - -import io.trino.plugin.hive.RecordFileWriter.ExtendedRecordWriter; -import org.apache.avro.Schema; -import org.apache.avro.file.CodecFactory; -import org.apache.avro.file.DataFileWriter; -import org.apache.avro.generic.GenericDatumWriter; -import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter; -import org.apache.hadoop.hive.ql.io.avro.AvroGenericRecordWriter; -import org.apache.hadoop.hive.serde2.avro.AvroSerdeException; -import org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapred.JobConf; - -import java.io.IOException; -import java.util.Properties; - -import static org.apache.avro.file.CodecFactory.DEFAULT_DEFLATE_LEVEL; -import static org.apache.avro.file.DataFileConstants.DEFLATE_CODEC; -import static org.apache.avro.mapred.AvroJob.OUTPUT_CODEC; -import static org.apache.avro.mapred.AvroOutputFormat.DEFLATE_LEVEL_KEY; - -public class AvroRecordWriter - implements ExtendedRecordWriter -{ - private final RecordWriter delegate; - private final FSDataOutputStream outputStream; - - public AvroRecordWriter(Path path, JobConf jobConf, boolean isCompressed, Properties properties) - throws IOException - { - Schema schema; - try { - schema = AvroSerdeUtils.determineSchemaOrThrowException(jobConf, properties); - } - catch (AvroSerdeException e) { - throw new IOException(e); - } - GenericDatumWriter genericDatumWriter = new GenericDatumWriter<>(schema); - DataFileWriter dataFileWriter = new DataFileWriter<>(genericDatumWriter); - - if (isCompressed) { - int level = jobConf.getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL); - String codecName = jobConf.get(OUTPUT_CODEC, DEFLATE_CODEC); - CodecFactory factory = codecName.equals(DEFLATE_CODEC) - ? 
CodecFactory.deflateCodec(level) - : CodecFactory.fromString(codecName); - dataFileWriter.setCodec(factory); - } - - outputStream = path.getFileSystem(jobConf).create(path); - dataFileWriter.create(schema, outputStream); - delegate = new AvroGenericRecordWriter(dataFileWriter); - } - - @Override - public long getWrittenBytes() - { - return outputStream.getPos(); - } - - @Override - public void write(Writable writable) - throws IOException - { - delegate.write(writable); - } - - @Override - public void close(boolean abort) - throws IOException - { - delegate.close(abort); - } -} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/CsvFileWriterFactory.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/CsvFileWriterFactory.java index 94d500376de19..3453ec78827a8 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/CsvFileWriterFactory.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/CsvFileWriterFactory.java @@ -17,7 +17,6 @@ import io.trino.filesystem.TrinoFileSystemFactory; import io.trino.hive.formats.line.csv.CsvSerializerFactory; import io.trino.hive.formats.line.text.TextLineWriterFactory; -import io.trino.plugin.hive.HiveSessionProperties; import io.trino.spi.type.TypeManager; public class CsvFileWriterFactory @@ -30,7 +29,6 @@ public CsvFileWriterFactory(TrinoFileSystemFactory trinoFileSystemFactory, TypeM typeManager, new CsvSerializerFactory(), new TextLineWriterFactory(), - HiveSessionProperties::isCsvNativeWriterEnabled, true); } } diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/CsvPageSourceFactory.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/CsvPageSourceFactory.java index 39ed039532992..1990dc670c47b 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/CsvPageSourceFactory.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/CsvPageSourceFactory.java @@ -18,7 +18,6 @@ import io.trino.hive.formats.line.csv.CsvDeserializerFactory; import io.trino.hive.formats.line.text.TextLineReaderFactory; import io.trino.plugin.hive.HiveConfig; -import io.trino.plugin.hive.HiveSessionProperties; import static java.lang.Math.toIntExact; @@ -30,7 +29,6 @@ public CsvPageSourceFactory(TrinoFileSystemFactory trinoFileSystemFactory, HiveC { super(trinoFileSystemFactory, new CsvDeserializerFactory(), - new TextLineReaderFactory(1024, 1024, toIntExact(config.getTextMaxLineLength().toBytes())), - HiveSessionProperties::isCsvNativeReaderEnabled); + new TextLineReaderFactory(1024, 1024, toIntExact(config.getTextMaxLineLength().toBytes()))); } } diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/JsonFileWriterFactory.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/JsonFileWriterFactory.java index 77b32379c6d4f..c5bfdb309afd3 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/JsonFileWriterFactory.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/JsonFileWriterFactory.java @@ -17,7 +17,6 @@ import io.trino.filesystem.TrinoFileSystemFactory; import io.trino.hive.formats.line.json.JsonSerializerFactory; import io.trino.hive.formats.line.text.TextLineWriterFactory; -import io.trino.plugin.hive.HiveSessionProperties; import io.trino.spi.type.TypeManager; public class JsonFileWriterFactory @@ -30,7 +29,6 @@ public JsonFileWriterFactory(TrinoFileSystemFactory trinoFileSystemFactory, Type typeManager, new JsonSerializerFactory(), new TextLineWriterFactory(), - 
HiveSessionProperties::isJsonNativeWriterEnabled, false); } } diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/JsonPageSourceFactory.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/JsonPageSourceFactory.java index 1894518708463..7f9e794ab1abd 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/JsonPageSourceFactory.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/JsonPageSourceFactory.java @@ -18,7 +18,6 @@ import io.trino.hive.formats.line.json.JsonDeserializerFactory; import io.trino.hive.formats.line.text.TextLineReaderFactory; import io.trino.plugin.hive.HiveConfig; -import io.trino.plugin.hive.HiveSessionProperties; import static java.lang.Math.toIntExact; @@ -30,7 +29,6 @@ public JsonPageSourceFactory(TrinoFileSystemFactory trinoFileSystemFactory, Hive { super(trinoFileSystemFactory, new JsonDeserializerFactory(), - new TextLineReaderFactory(1024, 1024, toIntExact(config.getTextMaxLineLength().toBytes())), - HiveSessionProperties::isJsonNativeReaderEnabled); + new TextLineReaderFactory(1024, 1024, toIntExact(config.getTextMaxLineLength().toBytes()))); } } diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/LineFileWriterFactory.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/LineFileWriterFactory.java index 68abafb1891c3..6015b6dbc5cb7 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/LineFileWriterFactory.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/LineFileWriterFactory.java @@ -44,7 +44,6 @@ import java.util.Optional; import java.util.OptionalInt; import java.util.Properties; -import java.util.function.Predicate; import java.util.stream.IntStream; import static com.google.common.collect.ImmutableList.toImmutableList; @@ -64,7 +63,6 @@ public abstract class LineFileWriterFactory { private final TrinoFileSystemFactory fileSystemFactory; private final TypeManager typeManager; - private final Predicate activation; private final LineSerializerFactory lineSerializerFactory; private final LineWriterFactory lineWriterFactory; private final boolean headerSupported; @@ -74,12 +72,10 @@ protected LineFileWriterFactory( TypeManager typeManager, LineSerializerFactory lineSerializerFactory, LineWriterFactory lineWriterFactory, - Predicate activation, boolean headerSupported) { this.fileSystemFactory = requireNonNull(fileSystemFactory, "fileSystemFactory is null"); this.typeManager = requireNonNull(typeManager, "typeManager is null"); - this.activation = requireNonNull(activation, "activation is null"); this.lineSerializerFactory = requireNonNull(lineSerializerFactory, "lineSerializerFactory is null"); this.lineWriterFactory = requireNonNull(lineWriterFactory, "lineWriterFactory is null"); this.headerSupported = headerSupported; @@ -99,8 +95,7 @@ public Optional createFileWriter( WriterKind writerKind) { if (!lineWriterFactory.getHiveOutputFormatClassName().equals(storageFormat.getOutputFormat()) || - !lineSerializerFactory.getHiveSerDeClassNames().contains(storageFormat.getSerde()) || - !activation.test(session)) { + !lineSerializerFactory.getHiveSerDeClassNames().contains(storageFormat.getSerde())) { return Optional.empty(); } diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/LinePageSourceFactory.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/LinePageSourceFactory.java index c92774341170c..8126015781782 100644 --- 
a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/LinePageSourceFactory.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/LinePageSourceFactory.java @@ -43,7 +43,6 @@ import java.util.Optional; import java.util.OptionalInt; import java.util.Properties; -import java.util.function.Predicate; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.collect.ImmutableList.toImmutableList; @@ -67,17 +66,14 @@ public abstract class LinePageSourceFactory private final TrinoFileSystemFactory fileSystemFactory; private final LineDeserializerFactory lineDeserializerFactory; private final LineReaderFactory lineReaderFactory; - private final Predicate activation; protected LinePageSourceFactory( TrinoFileSystemFactory fileSystemFactory, LineDeserializerFactory lineDeserializerFactory, - LineReaderFactory lineReaderFactory, - Predicate activation) + LineReaderFactory lineReaderFactory) { this.fileSystemFactory = requireNonNull(fileSystemFactory, "fileSystemFactory is null"); this.lineDeserializerFactory = requireNonNull(lineDeserializerFactory, "lineDeserializerFactory is null"); - this.activation = requireNonNull(activation, "activation is null"); this.lineReaderFactory = requireNonNull(lineReaderFactory, "lineReaderFactory is null"); } @@ -97,8 +93,7 @@ public Optional createPageSource( AcidTransaction transaction) { if (!lineReaderFactory.getHiveOutputFormatClassName().equals(schema.getProperty(FILE_INPUT_FORMAT)) || - !lineDeserializerFactory.getHiveSerDeClassNames().contains(getDeserializerClassName(schema)) || - !activation.test(session)) { + !lineDeserializerFactory.getHiveSerDeClassNames().contains(getDeserializerClassName(schema))) { return Optional.empty(); } diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/OpenXJsonFileWriterFactory.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/OpenXJsonFileWriterFactory.java index 68291237bd1ba..e4097455d475b 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/OpenXJsonFileWriterFactory.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/OpenXJsonFileWriterFactory.java @@ -17,7 +17,6 @@ import io.trino.filesystem.TrinoFileSystemFactory; import io.trino.hive.formats.line.openxjson.OpenXJsonSerializerFactory; import io.trino.hive.formats.line.text.TextLineWriterFactory; -import io.trino.plugin.hive.HiveSessionProperties; import io.trino.spi.type.TypeManager; public class OpenXJsonFileWriterFactory @@ -30,7 +29,6 @@ public OpenXJsonFileWriterFactory(TrinoFileSystemFactory trinoFileSystemFactory, typeManager, new OpenXJsonSerializerFactory(), new TextLineWriterFactory(), - HiveSessionProperties::isOpenXJsonNativeWriterEnabled, true); } } diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/OpenXJsonPageSourceFactory.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/OpenXJsonPageSourceFactory.java index 9c9a972ba88d4..1598ef6f09427 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/OpenXJsonPageSourceFactory.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/OpenXJsonPageSourceFactory.java @@ -18,7 +18,6 @@ import io.trino.hive.formats.line.openxjson.OpenXJsonDeserializerFactory; import io.trino.hive.formats.line.text.TextLineReaderFactory; import io.trino.plugin.hive.HiveConfig; -import io.trino.plugin.hive.HiveSessionProperties; import static java.lang.Math.toIntExact; @@ -30,7 +29,6 @@ public OpenXJsonPageSourceFactory(TrinoFileSystemFactory 
trinoFileSystemFactory, { super(trinoFileSystemFactory, new OpenXJsonDeserializerFactory(), - new TextLineReaderFactory(1024, 1024, toIntExact(config.getTextMaxLineLength().toBytes())), - HiveSessionProperties::isOpenXJsonNativeReaderEnabled); + new TextLineReaderFactory(1024, 1024, toIntExact(config.getTextMaxLineLength().toBytes()))); } } diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/RegexPageSourceFactory.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/RegexPageSourceFactory.java index c04dfc6f80ea4..5af670e37210c 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/RegexPageSourceFactory.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/RegexPageSourceFactory.java @@ -18,7 +18,6 @@ import io.trino.hive.formats.line.regex.RegexDeserializerFactory; import io.trino.hive.formats.line.text.TextLineReaderFactory; import io.trino.plugin.hive.HiveConfig; -import io.trino.plugin.hive.HiveSessionProperties; import static java.lang.Math.toIntExact; @@ -30,7 +29,6 @@ public RegexPageSourceFactory(TrinoFileSystemFactory trinoFileSystemFactory, Hiv { super(trinoFileSystemFactory, new RegexDeserializerFactory(), - new TextLineReaderFactory(1024, 1024, toIntExact(config.getTextMaxLineLength().toBytes())), - HiveSessionProperties::isRegexNativeReaderEnabled); + new TextLineReaderFactory(1024, 1024, toIntExact(config.getTextMaxLineLength().toBytes()))); } } diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/SimpleSequenceFilePageSourceFactory.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/SimpleSequenceFilePageSourceFactory.java index 1669266d19661..f77a91b408b8b 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/SimpleSequenceFilePageSourceFactory.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/SimpleSequenceFilePageSourceFactory.java @@ -18,7 +18,6 @@ import io.trino.hive.formats.line.sequence.SequenceFileReaderFactory; import io.trino.hive.formats.line.simple.SimpleDeserializerFactory; import io.trino.plugin.hive.HiveConfig; -import io.trino.plugin.hive.HiveSessionProperties; import static java.lang.Math.toIntExact; @@ -30,7 +29,6 @@ public SimpleSequenceFilePageSourceFactory(TrinoFileSystemFactory trinoFileSyste { super(trinoFileSystemFactory, new SimpleDeserializerFactory(), - new SequenceFileReaderFactory(1024, toIntExact(config.getTextMaxLineLength().toBytes())), - HiveSessionProperties::isSequenceFileNativeReaderEnabled); + new SequenceFileReaderFactory(1024, toIntExact(config.getTextMaxLineLength().toBytes()))); } } diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/SimpleSequenceFileWriterFactory.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/SimpleSequenceFileWriterFactory.java index 77bc24f43a31c..291a3c4c0edf1 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/SimpleSequenceFileWriterFactory.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/SimpleSequenceFileWriterFactory.java @@ -17,7 +17,6 @@ import io.trino.filesystem.TrinoFileSystemFactory; import io.trino.hive.formats.line.sequence.SequenceFileWriterFactory; import io.trino.hive.formats.line.simple.SimpleSerializerFactory; -import io.trino.plugin.hive.HiveSessionProperties; import io.trino.plugin.hive.NodeVersion; import io.trino.spi.type.TypeManager; @@ -31,7 +30,6 @@ public SimpleSequenceFileWriterFactory(TrinoFileSystemFactory trinoFileSystemFac typeManager, new SimpleSerializerFactory(), new 
SequenceFileWriterFactory(nodeVersion.toString()), - HiveSessionProperties::isSequenceFileNativeWriterEnabled, false); } } diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/SimpleTextFilePageSourceFactory.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/SimpleTextFilePageSourceFactory.java index 5f96f1cc48058..e283b9668c8d9 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/SimpleTextFilePageSourceFactory.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/SimpleTextFilePageSourceFactory.java @@ -18,7 +18,6 @@ import io.trino.hive.formats.line.simple.SimpleDeserializerFactory; import io.trino.hive.formats.line.text.TextLineReaderFactory; import io.trino.plugin.hive.HiveConfig; -import io.trino.plugin.hive.HiveSessionProperties; import static java.lang.Math.toIntExact; @@ -30,7 +29,6 @@ public SimpleTextFilePageSourceFactory(TrinoFileSystemFactory trinoFileSystemFac { super(trinoFileSystemFactory, new SimpleDeserializerFactory(), - new TextLineReaderFactory(1024, 1024, toIntExact(config.getTextMaxLineLength().toBytes())), - HiveSessionProperties::isTextFileNativeReaderEnabled); + new TextLineReaderFactory(1024, 1024, toIntExact(config.getTextMaxLineLength().toBytes()))); } } diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/SimpleTextFileWriterFactory.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/SimpleTextFileWriterFactory.java index 68ebad92ef85b..523eb8a35246f 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/SimpleTextFileWriterFactory.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/line/SimpleTextFileWriterFactory.java @@ -17,7 +17,6 @@ import io.trino.filesystem.TrinoFileSystemFactory; import io.trino.hive.formats.line.simple.SimpleSerializerFactory; import io.trino.hive.formats.line.text.TextLineWriterFactory; -import io.trino.plugin.hive.HiveSessionProperties; import io.trino.spi.type.TypeManager; public class SimpleTextFileWriterFactory @@ -30,7 +29,6 @@ public SimpleTextFileWriterFactory(TrinoFileSystemFactory trinoFileSystemFactory typeManager, new SimpleSerializerFactory(), new TextLineWriterFactory(), - HiveSessionProperties::isTextFileNativeWriterEnabled, false); } } diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/parquet/ParquetRecordWriter.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/parquet/ParquetRecordWriter.java deleted file mode 100644 index bedff16e753ee..0000000000000 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/parquet/ParquetRecordWriter.java +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package io.trino.plugin.hive.parquet; - -import io.trino.plugin.hive.RecordFileWriter.ExtendedRecordWriter; -import io.trino.spi.connector.ConnectorSession; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter; -import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat; -import org.apache.hadoop.hive.ql.io.parquet.write.ParquetRecordWriterWrapper; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.Reporter; -import org.apache.parquet.hadoop.DisabledMemoryManager; -import org.apache.parquet.hadoop.ParquetFileWriter; -import org.apache.parquet.hadoop.ParquetOutputFormat; - -import java.io.IOException; -import java.lang.reflect.Field; -import java.util.Properties; - -import static io.trino.plugin.hive.HiveSessionProperties.getParquetWriterBlockSize; -import static io.trino.plugin.hive.HiveSessionProperties.getParquetWriterPageSize; -import static java.util.Objects.requireNonNull; - -public final class ParquetRecordWriter - implements ExtendedRecordWriter -{ - private static final Field REAL_WRITER_FIELD; - private static final Field INTERNAL_WRITER_FIELD; - private static final Field FILE_WRITER_FIELD; - - static { - try { - REAL_WRITER_FIELD = ParquetRecordWriterWrapper.class.getDeclaredField("realWriter"); - INTERNAL_WRITER_FIELD = org.apache.parquet.hadoop.ParquetRecordWriter.class.getDeclaredField("internalWriter"); - FILE_WRITER_FIELD = INTERNAL_WRITER_FIELD.getType().getDeclaredField("parquetFileWriter"); - - REAL_WRITER_FIELD.setAccessible(true); - INTERNAL_WRITER_FIELD.setAccessible(true); - FILE_WRITER_FIELD.setAccessible(true); - - replaceHadoopParquetMemoryManager(); - } - catch (ReflectiveOperationException e) { - throw new AssertionError(e); - } - } - - public static RecordWriter create(Path target, JobConf conf, Properties properties, ConnectorSession session) - throws IOException, ReflectiveOperationException - { - conf.setLong(ParquetOutputFormat.BLOCK_SIZE, getParquetWriterBlockSize(session).toBytes()); - conf.setLong(ParquetOutputFormat.PAGE_SIZE, getParquetWriterPageSize(session).toBytes()); - - RecordWriter recordWriter = new MapredParquetOutputFormat() - .getHiveRecordWriter(conf, target, Text.class, false, properties, Reporter.NULL); - - Object realWriter = REAL_WRITER_FIELD.get(recordWriter); - Object internalWriter = INTERNAL_WRITER_FIELD.get(realWriter); - ParquetFileWriter fileWriter = (ParquetFileWriter) FILE_WRITER_FIELD.get(internalWriter); - - return new ParquetRecordWriter(recordWriter, fileWriter); - } - - public static void replaceHadoopParquetMemoryManager() - { - try { - Field memoryManager = org.apache.parquet.hadoop.ParquetOutputFormat.class.getDeclaredField("memoryManager"); - memoryManager.setAccessible(true); - memoryManager.set(null, new DisabledMemoryManager()); - } - catch (ReflectiveOperationException e) { - throw new AssertionError(e); - } - } - - private final RecordWriter recordWriter; - private final ParquetFileWriter fileWriter; - private long length; - - private ParquetRecordWriter(RecordWriter recordWriter, ParquetFileWriter fileWriter) - { - this.recordWriter = requireNonNull(recordWriter, "recordWriter is null"); - this.fileWriter = requireNonNull(fileWriter, "fileWriter is null"); - } - - @Override - public long getWrittenBytes() - { - return length; - } - - @Override - public void write(Writable value) - throws IOException - { - recordWriter.write(value); - length = 
fileWriter.getPos(); - } - - @Override - public void close(boolean abort) - throws IOException - { - recordWriter.close(abort); - if (!abort) { - length = fileWriter.getPos(); - } - } -} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/procedure/SyncPartitionMetadataProcedure.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/procedure/SyncPartitionMetadataProcedure.java index f25b56bd99179..3be1d4d10d3bb 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/procedure/SyncPartitionMetadataProcedure.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/procedure/SyncPartitionMetadataProcedure.java @@ -15,13 +15,12 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Lists; -import com.google.common.collect.Sets; +import com.google.common.collect.ImmutableSet; import com.google.inject.Inject; import com.google.inject.Provider; import io.trino.filesystem.Location; -import io.trino.hdfs.HdfsContext; -import io.trino.hdfs.HdfsEnvironment; +import io.trino.filesystem.TrinoFileSystem; +import io.trino.filesystem.TrinoFileSystemFactory; import io.trino.plugin.hive.PartitionStatistics; import io.trino.plugin.hive.TransactionalMetadataFactory; import io.trino.plugin.hive.metastore.Column; @@ -36,19 +35,15 @@ import io.trino.spi.connector.TableNotFoundException; import io.trino.spi.procedure.Procedure; import io.trino.spi.procedure.Procedure.Argument; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import java.io.IOException; import java.lang.invoke.MethodHandle; -import java.util.HashSet; import java.util.List; import java.util.Optional; import java.util.Set; -import java.util.stream.Stream; -import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.base.Verify.verify; +import static com.google.common.collect.Sets.difference; import static io.trino.plugin.base.util.Procedures.checkProcedureArgument; import static io.trino.plugin.hive.HiveErrorCode.HIVE_FILESYSTEM_ERROR; import static io.trino.plugin.hive.HiveMetadata.PRESTO_QUERY_ID_NAME; @@ -57,6 +52,7 @@ import static io.trino.spi.type.BooleanType.BOOLEAN; import static io.trino.spi.type.VarcharType.VARCHAR; import static java.lang.Boolean.TRUE; +import static java.lang.String.join; import static java.lang.invoke.MethodHandles.lookup; import static java.util.Locale.ENGLISH; import static java.util.Objects.requireNonNull; @@ -69,8 +65,6 @@ public enum SyncMode ADD, DROP, FULL } - private static final int BATCH_GET_PARTITIONS_BY_NAMES_MAX_PAGE_SIZE = 1000; - private static final MethodHandle SYNC_PARTITION_METADATA; static { @@ -83,15 +77,15 @@ public enum SyncMode } private final TransactionalMetadataFactory hiveMetadataFactory; - private final HdfsEnvironment hdfsEnvironment; + private final TrinoFileSystemFactory fileSystemFactory; @Inject public SyncPartitionMetadataProcedure( TransactionalMetadataFactory hiveMetadataFactory, - HdfsEnvironment hdfsEnvironment) + TrinoFileSystemFactory fileSystemFactory) { this.hiveMetadataFactory = requireNonNull(hiveMetadataFactory, "hiveMetadataFactory is null"); - this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + this.fileSystemFactory = requireNonNull(fileSystemFactory, "fileSystemFactory is null"); } @Override @@ -122,7 +116,6 @@ private void doSyncPartitionMetadata(ConnectorSession session, ConnectorAccessCo 
checkProcedureArgument(mode != null, "mode cannot be null"); SyncMode syncMode = toSyncMode(mode); - HdfsContext hdfsContext = new HdfsContext(session); SemiTransactionalHiveMetastore metastore = hiveMetadataFactory.create(session.getIdentity(), true).getMetastore(); SchemaTableName schemaTableName = new SchemaTableName(schemaName, tableName); @@ -139,77 +132,74 @@ private void doSyncPartitionMetadata(ConnectorSession session, ConnectorAccessCo accessControl.checkCanDeleteFromTable(null, new SchemaTableName(schemaName, tableName)); } - Path tableLocation = new Path(table.getStorage().getLocation()); + Location tableLocation = Location.of(table.getStorage().getLocation()); - Set partitionsToAdd; - Set partitionsToDrop; + Set partitionsInMetastore = metastore.getPartitionNames(schemaName, tableName) + .map(ImmutableSet::copyOf) + .orElseThrow(() -> new TableNotFoundException(schemaTableName)); + Set partitionsInFileSystem = listPartitions(fileSystemFactory.create(session), tableLocation, table.getPartitionColumns(), caseSensitive); - try { - FileSystem fileSystem = hdfsEnvironment.getFileSystem(hdfsContext, tableLocation); - List partitionsNamesInMetastore = metastore.getPartitionNames(schemaName, tableName) - .orElseThrow(() -> new TableNotFoundException(schemaTableName)); - List partitionsInMetastore = getPartitionsInMetastore(schemaTableName, tableLocation, partitionsNamesInMetastore, metastore); - List partitionsInFileSystem = listDirectory(fileSystem, fileSystem.getFileStatus(tableLocation), table.getPartitionColumns(), table.getPartitionColumns().size(), caseSensitive).stream() - .map(fileStatus -> fileStatus.getPath().toUri()) - .map(uri -> tableLocation.toUri().relativize(uri).getPath()) - .collect(toImmutableList()); + // partitions in file system but not in metastore + Set partitionsToAdd = difference(partitionsInFileSystem, partitionsInMetastore); - // partitions in file system but not in metastore - partitionsToAdd = difference(partitionsInFileSystem, partitionsInMetastore); - // partitions in metastore but not in file system - partitionsToDrop = difference(partitionsInMetastore, partitionsInFileSystem); - } - catch (IOException e) { - throw new TrinoException(HIVE_FILESYSTEM_ERROR, e); - } + // partitions in metastore but not in file system + Set partitionsToDrop = difference(partitionsInMetastore, partitionsInFileSystem); syncPartitions(partitionsToAdd, partitionsToDrop, syncMode, metastore, session, table); } - private List getPartitionsInMetastore(SchemaTableName schemaTableName, Path tableLocation, List partitionsNames, SemiTransactionalHiveMetastore metastore) + private static Set listPartitions(TrinoFileSystem fileSystem, Location directory, List partitionColumns, boolean caseSensitive) { - ImmutableList.Builder partitionsInMetastoreBuilder = ImmutableList.builderWithExpectedSize(partitionsNames.size()); - for (List partitionsNamesBatch : Lists.partition(partitionsNames, BATCH_GET_PARTITIONS_BY_NAMES_MAX_PAGE_SIZE)) { - metastore.getPartitionsByNames(schemaTableName.getSchemaName(), schemaTableName.getTableName(), partitionsNamesBatch).values().stream() - .filter(Optional::isPresent).map(Optional::get) - .map(partition -> new Path(partition.getStorage().getLocation()).toUri()) - .map(uri -> tableLocation.toUri().relativize(uri).getPath()) - .forEach(partitionsInMetastoreBuilder::add); - } - return partitionsInMetastoreBuilder.build(); + return doListPartitions(fileSystem, directory, partitionColumns, partitionColumns.size(), caseSensitive, ImmutableList.of()); } - private 
static List listDirectory(FileSystem fileSystem, FileStatus current, List partitionColumns, int depth, boolean caseSensitive) + private static Set doListPartitions(TrinoFileSystem fileSystem, Location directory, List partitionColumns, int depth, boolean caseSensitive, List partitions) { if (depth == 0) { - return ImmutableList.of(current); + return ImmutableSet.of(join("/", partitions)); } + ImmutableSet.Builder result = ImmutableSet.builder(); + for (Location location : listDirectories(fileSystem, directory)) { + String path = listedDirectoryName(directory, location); + Column column = partitionColumns.get(partitionColumns.size() - depth); + if (!isValidPartitionPath(path, column, caseSensitive)) { + continue; + } + List current = ImmutableList.builder().addAll(partitions).add(path).build(); + result.addAll(doListPartitions(fileSystem, location, partitionColumns, depth - 1, caseSensitive, current)); + } + return result.build(); + } + + private static Set listDirectories(TrinoFileSystem fileSystem, Location directory) + { try { - return Stream.of(fileSystem.listStatus(current.getPath())) - .filter(fileStatus -> isValidPartitionPath(fileStatus, partitionColumns.get(partitionColumns.size() - depth), caseSensitive)) - .flatMap(directory -> listDirectory(fileSystem, directory, partitionColumns, depth - 1, caseSensitive).stream()) - .collect(toImmutableList()); + return fileSystem.listDirectories(directory); } catch (IOException e) { throw new TrinoException(HIVE_FILESYSTEM_ERROR, e); } } - private static boolean isValidPartitionPath(FileStatus file, Column column, boolean caseSensitive) + private static String listedDirectoryName(Location directory, Location location) { - String path = file.getPath().getName(); - if (!caseSensitive) { - path = path.toLowerCase(ENGLISH); + String prefix = directory.path(); + if (!prefix.endsWith("/")) { + prefix += "/"; } - String prefix = column.getName() + '='; - return file.isDirectory() && path.startsWith(prefix); + String path = location.path(); + verify(path.endsWith("/"), "path does not end with slash: %s", location); + verify(path.startsWith(prefix), "path [%s] is not a child of directory [%s]", location, directory); + return path.substring(prefix.length(), path.length() - 1); } - // calculate relative complement of set b with respect to set a - private static Set difference(List a, List b) + private static boolean isValidPartitionPath(String path, Column column, boolean caseSensitive) { - return Sets.difference(new HashSet<>(a), new HashSet<>(b)); + if (!caseSensitive) { + path = path.toLowerCase(ENGLISH); + } + return path.startsWith(column.getName() + '='); } private static void syncPartitions( @@ -274,7 +264,7 @@ private static Partition buildPartitionObject(ConnectorSession session, Table ta .setParameters(ImmutableMap.of(PRESTO_QUERY_ID_NAME, session.getQueryId())) .withStorage(storage -> storage .setStorageFormat(table.getStorage().getStorageFormat()) - .setLocation(new Path(table.getStorage().getLocation(), partitionName).toString()) + .setLocation(Location.of(table.getStorage().getLocation()).appendPath(partitionName).toString()) .setBucketProperty(table.getStorage().getBucketProperty()) .setSerdeParameters(table.getStorage().getSerdeParameters())) .build(); diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/IonSqlQueryBuilder.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/IonSqlQueryBuilder.java deleted file mode 100644 index 53a1ca72288b7..0000000000000 --- 
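// A hedged sketch (illustrative values, not taken from this patch) of how the rewritten procedure now
// discovers partitions: listPartitions walks one directory level per partition column with
// TrinoFileSystem.listDirectories and joins the matching "col=value" directory names with '/', so for
// partition columns (ds, country) a layout ds=2023-10-01/country=US/ yields the partition name
// "ds=2023-10-01/country=US". The resulting set is compared against the metastore names with the
// statically imported Sets.difference to decide what to add or drop; the metastore set below is a placeholder.
TrinoFileSystem fileSystem = fileSystemFactory.create(session);
Location tableLocation = Location.of(table.getStorage().getLocation());
Set<String> inFileSystem = listPartitions(fileSystem, tableLocation, table.getPartitionColumns(), caseSensitive);
Set<String> inMetastore = ImmutableSet.of("ds=2023-10-01/country=US"); // placeholder metastore state
Set<String> partitionsToAdd = difference(inFileSystem, inMetastore);   // on disk, missing from metastore
Set<String> partitionsToDrop = difference(inMetastore, inFileSystem);  // in metastore, missing on disk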
a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/IonSqlQueryBuilder.java +++ /dev/null @@ -1,274 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.trino.plugin.hive.s3select; - -import com.google.common.base.Joiner; -import com.google.common.collect.ImmutableList; -import com.google.common.primitives.Shorts; -import com.google.common.primitives.SignedBytes; -import io.airlift.slice.Slice; -import io.trino.plugin.hive.HiveColumnHandle; -import io.trino.spi.predicate.Domain; -import io.trino.spi.predicate.Range; -import io.trino.spi.predicate.TupleDomain; -import io.trino.spi.type.Type; -import io.trino.spi.type.TypeManager; -import io.trino.spi.type.VarcharType; -import org.joda.time.format.DateTimeFormatter; - -import java.util.ArrayList; -import java.util.List; -import java.util.Optional; - -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkState; -import static com.google.common.collect.Iterables.getOnlyElement; -import static io.trino.plugin.hive.s3select.S3SelectDataType.CSV; -import static io.trino.spi.type.BigintType.BIGINT; -import static io.trino.spi.type.BooleanType.BOOLEAN; -import static io.trino.spi.type.DateType.DATE; -import static io.trino.spi.type.IntegerType.INTEGER; -import static io.trino.spi.type.SmallintType.SMALLINT; -import static io.trino.spi.type.TinyintType.TINYINT; -import static java.lang.Math.toIntExact; -import static java.lang.String.format; -import static java.util.Objects.requireNonNull; -import static java.util.concurrent.TimeUnit.DAYS; -import static java.util.stream.Collectors.joining; -import static org.joda.time.chrono.ISOChronology.getInstanceUTC; -import static org.joda.time.format.ISODateTimeFormat.date; - -/** - * S3 Select uses Ion SQL++ query language. This class is used to construct a valid Ion SQL++ query - * to be evaluated with S3 Select on an S3 object. 
- */ -public class IonSqlQueryBuilder -{ - private static final DateTimeFormatter FORMATTER = date().withChronology(getInstanceUTC()); - private static final String DATA_SOURCE = "S3Object s"; - private final TypeManager typeManager; - private final S3SelectDataType s3SelectDataType; - private final String nullPredicate; - private final String notNullPredicate; - - public IonSqlQueryBuilder(TypeManager typeManager, S3SelectDataType s3SelectDataType, Optional optionalNullCharacterEncoding) - { - if (optionalNullCharacterEncoding.isPresent()) { - checkArgument(s3SelectDataType == CSV, "Null character encoding should only be provided for CSV data"); - } - - this.typeManager = requireNonNull(typeManager, "typeManager is null"); - this.s3SelectDataType = requireNonNull(s3SelectDataType, "s3SelectDataType is null"); - - String nullCharacterEncoding = optionalNullCharacterEncoding.orElse(""); - this.nullPredicate = switch (s3SelectDataType) { - case JSON -> "IS NULL"; - case CSV -> "= '%s'".formatted(nullCharacterEncoding); - }; - this.notNullPredicate = switch (s3SelectDataType) { - case JSON -> "IS NOT NULL"; - case CSV -> "!= '%s'".formatted(nullCharacterEncoding); - }; - } - - public String buildSql(List columns, TupleDomain tupleDomain) - { - columns.forEach(column -> checkArgument(column.isBaseColumn(), "%s is not a base column", column)); - tupleDomain.getDomains().ifPresent(domains -> { - domains.keySet().forEach(column -> checkArgument(column.isBaseColumn(), "%s is not a base column", column)); - }); - - // SELECT clause - StringBuilder sql = new StringBuilder("SELECT "); - - if (columns.isEmpty()) { - sql.append("' '"); - } - else { - String columnNames = columns.stream() - .map(this::getFullyQualifiedColumnName) - .collect(joining(", ")); - sql.append(columnNames); - } - - // FROM clause - sql.append(" FROM "); - sql.append(DATA_SOURCE); - - // WHERE clause - List clauses = toConjuncts(columns, tupleDomain); - if (!clauses.isEmpty()) { - sql.append(" WHERE ") - .append(Joiner.on(" AND ").join(clauses)); - } - - return sql.toString(); - } - - private String getFullyQualifiedColumnName(HiveColumnHandle column) - { - return switch (s3SelectDataType) { - case JSON -> "s.%s".formatted(column.getBaseColumnName()); - case CSV -> "s._%d".formatted(column.getBaseHiveColumnIndex() + 1); - }; - } - - private List toConjuncts(List columns, TupleDomain tupleDomain) - { - ImmutableList.Builder builder = ImmutableList.builder(); - for (HiveColumnHandle column : columns) { - Type type = column.getHiveType().getType(typeManager); - if (tupleDomain.getDomains().isPresent() && isSupported(type)) { - Domain domain = tupleDomain.getDomains().get().get(column); - if (domain != null) { - builder.add(toPredicate(domain, type, column)); - } - } - } - return builder.build(); - } - - private static boolean isSupported(Type type) - { - Type validType = requireNonNull(type, "type is null"); - return validType.equals(BIGINT) || - validType.equals(TINYINT) || - validType.equals(SMALLINT) || - validType.equals(INTEGER) || - validType.equals(BOOLEAN) || - validType.equals(DATE) || - validType instanceof VarcharType; - } - - private String toPredicate(Domain domain, Type type, HiveColumnHandle column) - { - checkArgument(domain.getType().isOrderable(), "Domain type must be orderable"); - - if (domain.getValues().isNone()) { - if (domain.isNullAllowed()) { - return getFullyQualifiedColumnName(column) + " " + nullPredicate; - } - return "FALSE"; - } - - if (domain.getValues().isAll()) { - if (domain.isNullAllowed()) { - 
return "TRUE"; - } - return getFullyQualifiedColumnName(column) + " " + notNullPredicate; - } - - List disjuncts = new ArrayList<>(); - List singleValues = new ArrayList<>(); - for (Range range : domain.getValues().getRanges().getOrderedRanges()) { - checkState(!range.isAll()); - if (range.isSingleValue()) { - singleValues.add(range.getSingleValue()); - continue; - } - List rangeConjuncts = new ArrayList<>(); - if (!range.isLowUnbounded()) { - rangeConjuncts.add(toPredicate(range.isLowInclusive() ? ">=" : ">", range.getLowBoundedValue(), type, column)); - } - if (!range.isHighUnbounded()) { - rangeConjuncts.add(toPredicate(range.isHighInclusive() ? "<=" : "<", range.getHighBoundedValue(), type, column)); - } - // If rangeConjuncts is null, then the range was ALL, which should already have been checked for - checkState(!rangeConjuncts.isEmpty()); - if (rangeConjuncts.size() == 1) { - disjuncts.add("%s %s AND %s".formatted(getFullyQualifiedColumnName(column), notNullPredicate, getOnlyElement(rangeConjuncts))); - } - else { - disjuncts.add("(%s %s AND %s)".formatted(getFullyQualifiedColumnName(column), notNullPredicate, Joiner.on(" AND ").join(rangeConjuncts))); - } - } - - // Add back all of the possible single values either as an equality or an IN predicate - if (singleValues.size() == 1) { - disjuncts.add("%s %s AND %s".formatted(getFullyQualifiedColumnName(column), notNullPredicate, toPredicate("=", getOnlyElement(singleValues), type, column))); - } - else if (singleValues.size() > 1) { - List values = new ArrayList<>(); - for (Object value : singleValues) { - checkType(type); - values.add(valueToQuery(type, value)); - } - disjuncts.add("%s %s AND %s IN (%s)".formatted( - getFullyQualifiedColumnName(column), - notNullPredicate, - createColumn(type, column), - Joiner.on(",").join(values))); - } - - // Add nullability disjuncts - checkState(!disjuncts.isEmpty()); - if (domain.isNullAllowed()) { - disjuncts.add(getFullyQualifiedColumnName(column) + " " + nullPredicate); - } - - return "(" + Joiner.on(" OR ").join(disjuncts) + ")"; - } - - private String toPredicate(String operator, Object value, Type type, HiveColumnHandle column) - { - checkType(type); - - return format("%s %s %s", createColumn(type, column), operator, valueToQuery(type, value)); - } - - private static void checkType(Type type) - { - checkArgument(isSupported(type), "Type not supported: %s", type); - } - - private static String valueToQuery(Type type, Object value) - { - if (type.equals(BIGINT)) { - return String.valueOf((long) value); - } - if (type.equals(INTEGER)) { - return String.valueOf(toIntExact((long) value)); - } - if (type.equals(SMALLINT)) { - return String.valueOf(Shorts.checkedCast((long) value)); - } - if (type.equals(TINYINT)) { - return String.valueOf(SignedBytes.checkedCast((long) value)); - } - if (type.equals(BOOLEAN)) { - return String.valueOf((boolean) value); - } - if (type.equals(DATE)) { - // CAST('2007-04-05T14:30Z' AS TIMESTAMP) - return "'" + FORMATTER.print(DAYS.toMillis((long) value)) + "'"; - } - if (type.equals(VarcharType.VARCHAR)) { - return "'" + ((Slice) value).toStringUtf8().replace("'", "''") + "'"; - } - return "'" + ((Slice) value).toStringUtf8() + "'"; - } - - private String createColumn(Type type, HiveColumnHandle columnHandle) - { - String column = getFullyQualifiedColumnName(columnHandle); - - if (type.equals(BIGINT) || type.equals(INTEGER) || type.equals(SMALLINT) || type.equals(TINYINT)) { - return "CAST(" + column + " AS INT)"; - } - if (type.equals(BOOLEAN)) { - return "CAST(" 
+ column + " AS BOOL)"; - } - return column; - } -} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectDataType.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectDataType.java deleted file mode 100644 index 70872574d5dbc..0000000000000 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectDataType.java +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.trino.plugin.hive.s3select; - -public enum S3SelectDataType { - CSV, - JSON -} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectLineRecordReader.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectLineRecordReader.java deleted file mode 100644 index ec74e14a77591..0000000000000 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectLineRecordReader.java +++ /dev/null @@ -1,308 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package io.trino.plugin.hive.s3select; - -import com.amazonaws.AbortedException; -import com.amazonaws.services.s3.model.AmazonS3Exception; -import com.amazonaws.services.s3.model.CompressionType; -import com.amazonaws.services.s3.model.InputSerialization; -import com.amazonaws.services.s3.model.OutputSerialization; -import com.amazonaws.services.s3.model.ScanRange; -import com.amazonaws.services.s3.model.SelectObjectContentRequest; -import com.google.common.annotations.VisibleForTesting; -import com.google.common.io.Closer; -import com.google.errorprone.annotations.ThreadSafe; -import io.airlift.units.Duration; -import io.trino.hdfs.s3.HiveS3Config; -import io.trino.hdfs.s3.TrinoS3FileSystem; -import io.trino.spi.TrinoException; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.compress.BZip2Codec; -import org.apache.hadoop.io.compress.CompressionCodec; -import org.apache.hadoop.io.compress.CompressionCodecFactory; -import org.apache.hadoop.io.compress.GzipCodec; -import org.apache.hadoop.mapred.RecordReader; -import org.apache.hadoop.util.LineReader; - -import java.io.IOException; -import java.io.InputStream; -import java.io.InterruptedIOException; -import java.net.URI; -import java.nio.charset.StandardCharsets; -import java.util.Properties; - -import static com.amazonaws.services.s3.model.ExpressionType.SQL; -import static com.google.common.base.Throwables.throwIfInstanceOf; -import static com.google.common.base.Throwables.throwIfUnchecked; -import static io.trino.hdfs.s3.TrinoS3FileSystem.S3_MAX_BACKOFF_TIME; -import static io.trino.hdfs.s3.TrinoS3FileSystem.S3_MAX_CLIENT_RETRIES; -import static io.trino.hdfs.s3.TrinoS3FileSystem.S3_MAX_RETRY_TIME; -import static io.trino.plugin.hive.util.RetryDriver.retry; -import static io.trino.plugin.hive.util.SerdeConstants.LINE_DELIM; -import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED; -import static java.lang.String.format; -import static java.net.HttpURLConnection.HTTP_BAD_REQUEST; -import static java.net.HttpURLConnection.HTTP_FORBIDDEN; -import static java.net.HttpURLConnection.HTTP_NOT_FOUND; -import static java.util.Objects.requireNonNull; -import static java.util.concurrent.TimeUnit.SECONDS; - -@ThreadSafe -public abstract class S3SelectLineRecordReader - implements RecordReader -{ - private InputStream selectObjectContent; - private long processedRecords; - private long recordsFromS3; - private long position; - private LineReader reader; - private boolean isFirstLine; - private static final Duration BACKOFF_MIN_SLEEP = new Duration(1, SECONDS); - private final TrinoS3SelectClient selectClient; - private final long start; - private final long end; - private final int maxAttempts; - private final Duration maxBackoffTime; - private final Duration maxRetryTime; - private final Closer closer = Closer.create(); - private final SelectObjectContentRequest selectObjectContentRequest; - private final CompressionCodecFactory compressionCodecFactory; - private final String lineDelimiter; - private final Properties schema; - private final CompressionType compressionType; - - public S3SelectLineRecordReader( - Configuration configuration, - Path path, - long start, - long length, - Properties schema, - String ionSqlQuery, - TrinoS3ClientFactory s3ClientFactory) - { - requireNonNull(configuration, "configuration is null"); - requireNonNull(schema, "schema is null"); - requireNonNull(path, "path is 
null"); - requireNonNull(ionSqlQuery, "ionSqlQuery is null"); - requireNonNull(s3ClientFactory, "s3ClientFactory is null"); - this.lineDelimiter = (schema).getProperty(LINE_DELIM, "\n"); - this.processedRecords = 0; - this.recordsFromS3 = 0; - this.start = start; - this.position = this.start; - this.end = this.start + length; - this.isFirstLine = true; - - this.compressionCodecFactory = new CompressionCodecFactory(configuration); - this.compressionType = getCompressionType(path); - this.schema = schema; - this.selectObjectContentRequest = buildSelectObjectRequest(ionSqlQuery, path); - - HiveS3Config defaults = new HiveS3Config(); - this.maxAttempts = configuration.getInt(S3_MAX_CLIENT_RETRIES, defaults.getS3MaxClientRetries()) + 1; - this.maxBackoffTime = Duration.valueOf(configuration.get(S3_MAX_BACKOFF_TIME, defaults.getS3MaxBackoffTime().toString())); - this.maxRetryTime = Duration.valueOf(configuration.get(S3_MAX_RETRY_TIME, defaults.getS3MaxRetryTime().toString())); - - this.selectClient = new TrinoS3SelectClient(configuration, s3ClientFactory); - closer.register(selectClient); - } - - protected abstract InputSerialization buildInputSerialization(); - - protected abstract OutputSerialization buildOutputSerialization(); - - protected abstract boolean shouldEnableScanRange(); - - protected Properties getSchema() - { - return schema; - } - - protected CompressionType getCompressionType() - { - return compressionType; - } - - public SelectObjectContentRequest buildSelectObjectRequest(String query, Path path) - { - SelectObjectContentRequest selectObjectRequest = new SelectObjectContentRequest(); - URI uri = path.toUri(); - selectObjectRequest.setBucketName(TrinoS3FileSystem.extractBucketName(uri)); - selectObjectRequest.setKey(TrinoS3FileSystem.keyFromPath(path)); - selectObjectRequest.setExpression(query); - selectObjectRequest.setExpressionType(SQL); - - InputSerialization selectObjectInputSerialization = buildInputSerialization(); - selectObjectRequest.setInputSerialization(selectObjectInputSerialization); - - OutputSerialization selectObjectOutputSerialization = buildOutputSerialization(); - selectObjectRequest.setOutputSerialization(selectObjectOutputSerialization); - - if (shouldEnableScanRange()) { - ScanRange scanRange = new ScanRange(); - scanRange.setStart(getStart()); - scanRange.setEnd(getEnd()); - selectObjectRequest.setScanRange(scanRange); - } - - return selectObjectRequest; - } - - protected CompressionType getCompressionType(Path path) - { - CompressionCodec codec = compressionCodecFactory.getCodec(path); - if (codec == null) { - return CompressionType.NONE; - } - if (codec instanceof GzipCodec) { - return CompressionType.GZIP; - } - if (codec instanceof BZip2Codec) { - return CompressionType.BZIP2; - } - throw new TrinoException(NOT_SUPPORTED, "Compression extension not supported for S3 Select: " + path); - } - - private int readLine(Text value) - throws IOException - { - try { - return retry() - .maxAttempts(maxAttempts) - .exponentialBackoff(BACKOFF_MIN_SLEEP, maxBackoffTime, maxRetryTime, 2.0) - .stopOn(InterruptedException.class, UnrecoverableS3OperationException.class, AbortedException.class) - .run("readRecordsContentStream", () -> { - if (isFirstLine) { - recordsFromS3 = 0; - selectObjectContent = selectClient.getRecordsContent(selectObjectContentRequest); - closer.register(selectObjectContent); - reader = new LineReader(selectObjectContent, lineDelimiter.getBytes(StandardCharsets.UTF_8)); - closer.register(reader); - isFirstLine = false; - } - try { - return 
reader.readLine(value); - } - catch (RuntimeException e) { - isFirstLine = true; - recordsFromS3 = 0; - if (e instanceof AmazonS3Exception) { - switch (((AmazonS3Exception) e).getStatusCode()) { - case HTTP_FORBIDDEN: - case HTTP_NOT_FOUND: - case HTTP_BAD_REQUEST: - throw new UnrecoverableS3OperationException(selectClient.getBucketName(), selectClient.getKeyName(), e); - } - } - throw e; - } - }); - } - catch (InterruptedException | AbortedException e) { - Thread.currentThread().interrupt(); - throw new InterruptedIOException(); - } - catch (Exception e) { - throwIfInstanceOf(e, IOException.class); - throwIfUnchecked(e); - throw new RuntimeException(e); - } - } - - @Override - public synchronized boolean next(LongWritable key, Text value) - throws IOException - { - while (true) { - int bytes = readLine(value); - if (bytes <= 0) { - if (!selectClient.isRequestComplete()) { - throw new IOException("S3 Select request was incomplete as End Event was not received"); - } - return false; - } - recordsFromS3++; - if (recordsFromS3 > processedRecords) { - position += bytes; - processedRecords++; - key.set(processedRecords); - return true; - } - } - } - - @Override - public LongWritable createKey() - { - return new LongWritable(); - } - - @Override - public Text createValue() - { - return new Text(); - } - - @Override - public long getPos() - { - return position; - } - - @Override - public void close() - throws IOException - { - closer.close(); - } - - @Override - public float getProgress() - { - return ((float) (position - start)) / (end - start); - } - - /** - * This exception is for stopping retries for S3 Select calls that shouldn't be retried. - * For example, "Caused by: com.amazonaws.services.s3.model.AmazonS3Exception: Forbidden (Service: Amazon S3; Status Code: 403 ..." - */ - @VisibleForTesting - static class UnrecoverableS3OperationException - extends RuntimeException - { - public UnrecoverableS3OperationException(String bucket, String key, Throwable cause) - { - // append bucket and key to the message - super(format("%s (Bucket: %s, Key: %s)", cause, bucket, key)); - } - } - - protected long getStart() - { - return start; - } - - protected long getEnd() - { - return end; - } - - protected String getLineDelimiter() - { - return lineDelimiter; - } -} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectLineRecordReaderProvider.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectLineRecordReaderProvider.java deleted file mode 100644 index 49221c3982805..0000000000000 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectLineRecordReaderProvider.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package io.trino.plugin.hive.s3select; - -import io.trino.plugin.hive.s3select.csv.S3SelectCsvRecordReader; -import io.trino.plugin.hive.s3select.json.S3SelectJsonRecordReader; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; - -import java.util.Optional; -import java.util.Properties; - -/** - * Returns an S3SelectLineRecordReader based on the serDe class. It supports CSV and JSON formats, and - * will not push down any other formats. - */ -public class S3SelectLineRecordReaderProvider -{ - private S3SelectLineRecordReaderProvider() {} - - public static Optional get(Configuration configuration, - Path path, - long start, - long length, - Properties schema, - String ionSqlQuery, - TrinoS3ClientFactory s3ClientFactory, - S3SelectDataType dataType) - { - switch (dataType) { - case CSV: - return Optional.of(new S3SelectCsvRecordReader(configuration, path, start, length, schema, ionSqlQuery, s3ClientFactory)); - case JSON: - return Optional.of(new S3SelectJsonRecordReader(configuration, path, start, length, schema, ionSqlQuery, s3ClientFactory)); - default: - // return empty if data type is not returned by the serDeMapper or unrecognizable by the LineRecordReader - return Optional.empty(); - } - } -} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectPushdown.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectPushdown.java deleted file mode 100644 index b98e3f7c3bd81..0000000000000 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectPushdown.java +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.trino.plugin.hive.s3select; - -import com.google.common.collect.ImmutableSet; -import io.trino.hive.formats.compression.CompressionKind; -import io.trino.plugin.hive.metastore.Column; -import io.trino.plugin.hive.metastore.Partition; -import io.trino.plugin.hive.metastore.Table; -import io.trino.spi.connector.ConnectorSession; - -import java.util.List; -import java.util.Objects; -import java.util.Optional; -import java.util.Properties; -import java.util.Set; - -import static io.trino.plugin.hive.HiveMetadata.SKIP_FOOTER_COUNT_KEY; -import static io.trino.plugin.hive.HiveMetadata.SKIP_HEADER_COUNT_KEY; -import static io.trino.plugin.hive.HiveSessionProperties.isS3SelectPushdownEnabled; -import static io.trino.plugin.hive.metastore.MetastoreUtil.getHiveSchema; -import static io.trino.plugin.hive.util.HiveClassNames.TEXT_INPUT_FORMAT_CLASS; -import static io.trino.plugin.hive.util.HiveUtil.getDeserializerClassName; -import static io.trino.plugin.hive.util.HiveUtil.getInputFormatName; -import static java.util.Objects.requireNonNull; - -/** - * S3SelectPushdown uses Amazon S3 Select to push down queries to Amazon S3. This allows Presto to retrieve only a - * subset of data rather than retrieving the full S3 object thus improving Presto query performance. 
- */
-public final class S3SelectPushdown
-{
-    private static final Set<String> SUPPORTED_S3_PREFIXES = ImmutableSet.of("s3://", "s3a://", "s3n://");
-
-    /*
-     * Double and Real Types lose precision. Thus, they are not pushed down to S3.
-     * Correctness problems have also been observed with Decimal columns.
-     *
-     * When S3 select support was added, Trino did not properly implement TIMESTAMP semantics. This was fixed in 2020, and TIMESTAMPS may be supportable now
-     * (https://github.com/trinodb/trino/issues/10962). Pushing down timestamps to s3select may still be problematic due to ION SQL comparing timestamps
-     * using precision. This means timestamps with different precisions are not equal even when they actually represent the same instant of time.
-     */
-    private static final Set<String> SUPPORTED_COLUMN_TYPES = ImmutableSet.of(
-            "boolean",
-            "int",
-            "tinyint",
-            "smallint",
-            "bigint",
-            "string",
-            "date");
-
-    private S3SelectPushdown() {}
-
-    private static boolean isSerDeSupported(Properties schema)
-    {
-        String serdeName = getDeserializerClassName(schema);
-        return S3SelectSerDeDataTypeMapper.doesSerDeExist(serdeName);
-    }
-
-    private static boolean isInputFormatSupported(Properties schema)
-    {
-        if (isTextInputFormat(schema)) {
-            if (!Objects.equals(schema.getProperty(SKIP_HEADER_COUNT_KEY, "0"), "0")) {
-                // S3 Select supports skipping one line of headers, but it was returning incorrect results for trino-hive-hadoop2/conf/files/test_table_with_header.csv.gz
-                // TODO https://github.com/trinodb/trino/issues/2349
-                return false;
-            }
-
-            // S3 Select does not support skipping footers
-            return Objects.equals(schema.getProperty(SKIP_FOOTER_COUNT_KEY, "0"), "0");
-        }
-
-        return false;
-    }
-
-    public static boolean isCompressionCodecSupported(Properties schema, String path)
-    {
-        if (isTextInputFormat(schema)) {
-            // S3 Select supports the following formats: uncompressed, GZIP and BZIP2.
- return CompressionKind.forFile(path) - .map(kind -> kind == CompressionKind.GZIP || kind == CompressionKind.BZIP2) - .orElse(true); - } - - return false; - } - - public static boolean isSplittable(boolean s3SelectPushdownEnabled, Properties schema, String path) - { - if (!s3SelectPushdownEnabled) { - return true; - } - - // S3 Select supports splitting uncompressed files - if (isTextInputFormat(schema) && CompressionKind.forFile(path).isEmpty()) { - return isSerDeSupported(schema); - } - - return false; - } - - private static boolean isTextInputFormat(Properties schema) - { - return TEXT_INPUT_FORMAT_CLASS.equals(getInputFormatName(schema).orElse(null)); - } - - private static boolean areColumnTypesSupported(List columns) - { - requireNonNull(columns, "columns is null"); - - if (columns.isEmpty()) { - return false; - } - - for (Column column : columns) { - if (!SUPPORTED_COLUMN_TYPES.contains(column.getType().getHiveTypeName().toString())) { - return false; - } - } - - return true; - } - - private static boolean isS3Storage(String path) - { - return SUPPORTED_S3_PREFIXES.stream().anyMatch(path::startsWith); - } - - public static boolean shouldEnablePushdownForTable(ConnectorSession session, Table table, String path, Optional optionalPartition) - { - if (!isS3SelectPushdownEnabled(session)) { - return false; - } - - if (path == null) { - return false; - } - - // Hive table partitions could be on different storages, - // as a result, we have to check each individual optionalPartition - Properties schema = optionalPartition - .map(partition -> getHiveSchema(partition, table)) - .orElseGet(() -> getHiveSchema(table)); - return shouldEnablePushdownForTable(table, path, schema); - } - - private static boolean shouldEnablePushdownForTable(Table table, String path, Properties schema) - { - return isS3Storage(path) && - isSerDeSupported(schema) && - isInputFormatSupported(schema) && - areColumnTypesSupported(table.getDataColumns()); - } -} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectRecordCursor.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectRecordCursor.java deleted file mode 100644 index ac8b03646ec28..0000000000000 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectRecordCursor.java +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package io.trino.plugin.hive.s3select; - -import com.google.common.annotations.VisibleForTesting; -import io.trino.plugin.hive.GenericHiveRecordCursor; -import io.trino.plugin.hive.HiveColumnHandle; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapred.RecordReader; - -import java.util.List; -import java.util.Properties; -import java.util.stream.Collectors; - -import static io.trino.plugin.hive.util.SerdeConstants.LIST_COLUMNS; -import static io.trino.plugin.hive.util.SerdeConstants.LIST_COLUMN_TYPES; -import static java.util.Objects.requireNonNull; - -class S3SelectRecordCursor - extends GenericHiveRecordCursor -{ - public S3SelectRecordCursor( - Configuration configuration, - Path path, - RecordReader recordReader, - long totalBytes, - Properties splitSchema, - List columns) - { - super(configuration, path, recordReader, totalBytes, updateSplitSchema(splitSchema, columns), columns); - } - - // since s3select only returns the required column, not the whole columns - // we need to update the split schema to include only the required columns - // otherwise, Serde could not deserialize output from s3select to row data correctly - @VisibleForTesting - static Properties updateSplitSchema(Properties splitSchema, List columns) - { - requireNonNull(splitSchema, "splitSchema is null"); - requireNonNull(columns, "columns is null"); - // clone split properties for update so as not to affect the original one - Properties updatedSchema = new Properties(); - updatedSchema.putAll(splitSchema); - updatedSchema.setProperty(LIST_COLUMNS, buildColumns(columns)); - updatedSchema.setProperty(LIST_COLUMN_TYPES, buildColumnTypes(columns)); - return updatedSchema; - } - - private static String buildColumns(List columns) - { - if (columns == null || columns.isEmpty()) { - return ""; - } - return columns.stream() - .map(HiveColumnHandle::getName) - .collect(Collectors.joining(",")); - } - - private static String buildColumnTypes(List columns) - { - if (columns == null || columns.isEmpty()) { - return ""; - } - return columns.stream() - .map(column -> column.getHiveType().getTypeInfo().getTypeName()) - .collect(Collectors.joining(",")); - } -} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectRecordCursorProvider.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectRecordCursorProvider.java deleted file mode 100644 index da318adc4df1d..0000000000000 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectRecordCursorProvider.java +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package io.trino.plugin.hive.s3select; - -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableSet; -import com.google.inject.Inject; -import io.trino.filesystem.Location; -import io.trino.hdfs.HdfsEnvironment; -import io.trino.plugin.hive.HiveColumnHandle; -import io.trino.plugin.hive.HiveConfig; -import io.trino.plugin.hive.HiveRecordCursorProvider; -import io.trino.plugin.hive.ReaderColumns; -import io.trino.plugin.hive.s3select.csv.S3SelectCsvRecordReader; -import io.trino.plugin.hive.type.TypeInfo; -import io.trino.spi.TrinoException; -import io.trino.spi.connector.ConnectorSession; -import io.trino.spi.connector.RecordCursor; -import io.trino.spi.predicate.TupleDomain; -import io.trino.spi.type.TypeManager; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; - -import java.io.IOException; -import java.util.Arrays; -import java.util.List; -import java.util.Optional; -import java.util.Properties; -import java.util.Set; -import java.util.function.Function; - -import static com.google.common.collect.ImmutableList.toImmutableList; -import static com.google.common.collect.ImmutableSet.toImmutableSet; -import static io.trino.plugin.hive.HiveErrorCode.HIVE_FILESYSTEM_ERROR; -import static io.trino.plugin.hive.HivePageSourceProvider.projectBaseColumns; -import static io.trino.plugin.hive.s3select.S3SelectDataType.CSV; -import static io.trino.plugin.hive.type.TypeInfoUtils.getTypeInfosFromTypeString; -import static io.trino.plugin.hive.util.HiveUtil.getDeserializerClassName; -import static io.trino.plugin.hive.util.SerdeConstants.COLUMN_NAME_DELIMITER; -import static io.trino.plugin.hive.util.SerdeConstants.LIST_COLUMNS; -import static io.trino.plugin.hive.util.SerdeConstants.LIST_COLUMN_TYPES; -import static java.util.Objects.requireNonNull; - -public class S3SelectRecordCursorProvider - implements HiveRecordCursorProvider -{ - private final HdfsEnvironment hdfsEnvironment; - private final TrinoS3ClientFactory s3ClientFactory; - private final boolean experimentalPushdownEnabled; - - @Inject - public S3SelectRecordCursorProvider(HdfsEnvironment hdfsEnvironment, TrinoS3ClientFactory s3ClientFactory, HiveConfig hiveConfig) - { - this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); - this.s3ClientFactory = requireNonNull(s3ClientFactory, "s3ClientFactory is null"); - this.experimentalPushdownEnabled = hiveConfig.isS3SelectExperimentalPushdownEnabled(); - } - - @Override - public Optional createRecordCursor( - Configuration configuration, - ConnectorSession session, - Location location, - long start, - long length, - long fileSize, - Properties schema, - List columns, - TupleDomain effectivePredicate, - TypeManager typeManager, - boolean s3SelectPushdownEnabled) - { - if (!s3SelectPushdownEnabled) { - return Optional.empty(); - } - - Path path = new Path(location.toString()); - try { - this.hdfsEnvironment.getFileSystem(session.getIdentity(), path, configuration); - } - catch (IOException e) { - throw new TrinoException(HIVE_FILESYSTEM_ERROR, "Failed getting FileSystem: " + path, e); - } - - Optional projectedReaderColumns = projectBaseColumns(columns); - // Ignore predicates on partial columns for now. 
-        effectivePredicate = effectivePredicate.filter((column, domain) -> column.isBaseColumn());
-
-        List<HiveColumnHandle> readerColumns = projectedReaderColumns
-                .map(readColumns -> readColumns.get().stream().map(HiveColumnHandle.class::cast).collect(toImmutableList()))
-                .orElseGet(() -> ImmutableList.copyOf(columns));
-        // Query is not going to filter any data, no need to use S3 Select
-        if (!hasFilters(schema, effectivePredicate, readerColumns)) {
-            return Optional.empty();
-        }
-
-        String serdeName = getDeserializerClassName(schema);
-        Optional<S3SelectDataType> s3SelectDataTypeOptional = S3SelectSerDeDataTypeMapper.getDataType(serdeName);
-
-        if (s3SelectDataTypeOptional.isPresent()) {
-            S3SelectDataType s3SelectDataType = s3SelectDataTypeOptional.get();
-            if (s3SelectDataType == CSV && !experimentalPushdownEnabled) {
-                return Optional.empty();
-            }
-
-            Optional<String> nullCharacterEncoding = Optional.empty();
-            if (s3SelectDataType == CSV) {
-                nullCharacterEncoding = S3SelectCsvRecordReader.nullCharacterEncoding(schema);
-            }
-            IonSqlQueryBuilder queryBuilder = new IonSqlQueryBuilder(typeManager, s3SelectDataType, nullCharacterEncoding);
-            String ionSqlQuery = queryBuilder.buildSql(readerColumns, effectivePredicate);
-            Optional<S3SelectLineRecordReader> recordReader = S3SelectLineRecordReaderProvider.get(configuration, path, start, length, schema,
-                    ionSqlQuery, s3ClientFactory, s3SelectDataType);
-
-            if (recordReader.isEmpty()) {
-                // S3 Select data type is not mapped to an S3SelectLineRecordReader
-                return Optional.empty();
-            }
-
-            RecordCursor cursor = new S3SelectRecordCursor<>(configuration, path, recordReader.get(), length, schema, readerColumns);
-            return Optional.of(new ReaderRecordCursorWithProjections(cursor, projectedReaderColumns));
-        }
-        // unsupported serdes
-        return Optional.empty();
-    }
-
-    private static boolean hasFilters(
-            Properties schema,
-            TupleDomain<HiveColumnHandle> effectivePredicate,
-            List<HiveColumnHandle> readerColumns)
-    {
-        // If there are no effective predicates, and the reader columns and column types are identical to the schema,
-        // the query reads all data out of S3. We can use S3 GetObject instead of S3 SelectObjectContent in these cases.
- if (effectivePredicate.isAll()) { - return !isEquivalentSchema(readerColumns, schema); - } - return true; - } - - private static boolean isEquivalentSchema(List readerColumns, Properties schema) - { - Set projectedColumnNames = getColumnProperty(readerColumns, HiveColumnHandle::getName); - Set projectedColumnTypes = getColumnProperty(readerColumns, column -> column.getHiveType().getTypeInfo().getTypeName()); - return isEquivalentColumns(projectedColumnNames, schema) && isEquivalentColumnTypes(projectedColumnTypes, schema); - } - - private static boolean isEquivalentColumns(Set projectedColumnNames, Properties schema) - { - Set columnNames; - String columnNameProperty = schema.getProperty(LIST_COLUMNS); - if (columnNameProperty.length() == 0) { - columnNames = ImmutableSet.of(); - } - else { - String columnNameDelimiter = (String) schema.getOrDefault(COLUMN_NAME_DELIMITER, ","); - columnNames = Arrays.stream(columnNameProperty.split(columnNameDelimiter)) - .collect(toImmutableSet()); - } - return projectedColumnNames.equals(columnNames); - } - - private static boolean isEquivalentColumnTypes(Set projectedColumnTypes, Properties schema) - { - String columnTypeProperty = schema.getProperty(LIST_COLUMN_TYPES); - Set columnTypes; - if (columnTypeProperty.length() == 0) { - columnTypes = ImmutableSet.of(); - } - else { - columnTypes = getTypeInfosFromTypeString(columnTypeProperty) - .stream() - .map(TypeInfo::getTypeName) - .collect(toImmutableSet()); - } - return projectedColumnTypes.equals(columnTypes); - } - - private static Set getColumnProperty(List readerColumns, Function mapper) - { - if (readerColumns.isEmpty()) { - return ImmutableSet.of(); - } - return readerColumns.stream() - .map(mapper) - .collect(toImmutableSet()); - } -} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectSerDeDataTypeMapper.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectSerDeDataTypeMapper.java deleted file mode 100644 index 4695eb1a7e3be..0000000000000 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectSerDeDataTypeMapper.java +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.trino.plugin.hive.s3select; - -import java.util.Map; -import java.util.Optional; - -import static io.trino.plugin.hive.util.HiveClassNames.JSON_SERDE_CLASS; -import static io.trino.plugin.hive.util.HiveClassNames.LAZY_SIMPLE_SERDE_CLASS; - -public class S3SelectSerDeDataTypeMapper -{ - // Contains mapping of SerDe class name -> data type. Multiple SerDe classes can be mapped to the same data type. 
- private static final Map serDeToDataTypeMapping = Map.of( - LAZY_SIMPLE_SERDE_CLASS, S3SelectDataType.CSV, - JSON_SERDE_CLASS, S3SelectDataType.JSON); - - private S3SelectSerDeDataTypeMapper() {} - - public static Optional getDataType(String serdeName) - { - return Optional.ofNullable(serDeToDataTypeMapping.get(serdeName)); - } - - public static boolean doesSerDeExist(String serdeName) - { - return serDeToDataTypeMapping.containsKey(serdeName); - } -} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/TrinoS3ClientFactory.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/TrinoS3ClientFactory.java deleted file mode 100644 index 9a016c8e41ece..0000000000000 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/TrinoS3ClientFactory.java +++ /dev/null @@ -1,243 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.trino.plugin.hive.s3select; - -import com.amazonaws.ClientConfiguration; -import com.amazonaws.Protocol; -import com.amazonaws.SdkClientException; -import com.amazonaws.auth.AWSCredentials; -import com.amazonaws.auth.AWSCredentialsProvider; -import com.amazonaws.auth.AWSStaticCredentialsProvider; -import com.amazonaws.auth.BasicAWSCredentials; -import com.amazonaws.auth.BasicSessionCredentials; -import com.amazonaws.auth.DefaultAWSCredentialsProviderChain; -import com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider; -import com.amazonaws.regions.DefaultAwsRegionProviderChain; -import com.amazonaws.services.s3.AmazonS3; -import com.amazonaws.services.s3.AmazonS3Builder; -import com.amazonaws.services.s3.AmazonS3Client; -import com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClientBuilder; -import com.google.errorprone.annotations.concurrent.GuardedBy; -import com.google.inject.Inject; -import io.airlift.log.Logger; -import io.airlift.units.Duration; -import io.trino.hdfs.s3.HiveS3Config; -import io.trino.hdfs.s3.TrinoS3FileSystem; -import io.trino.plugin.hive.HiveConfig; -import org.apache.hadoop.conf.Configuration; - -import java.net.URI; -import java.util.Optional; - -import static com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration; -import static com.amazonaws.regions.Regions.US_EAST_1; -import static com.google.common.base.Strings.isNullOrEmpty; -import static com.google.common.base.Verify.verify; -import static io.trino.hdfs.s3.AwsCurrentRegionHolder.getCurrentRegionFromEC2Metadata; -import static io.trino.hdfs.s3.TrinoS3FileSystem.S3_ACCESS_KEY; -import static io.trino.hdfs.s3.TrinoS3FileSystem.S3_CONNECT_TIMEOUT; -import static io.trino.hdfs.s3.TrinoS3FileSystem.S3_CONNECT_TTL; -import static io.trino.hdfs.s3.TrinoS3FileSystem.S3_CREDENTIALS_PROVIDER; -import static io.trino.hdfs.s3.TrinoS3FileSystem.S3_ENDPOINT; -import static io.trino.hdfs.s3.TrinoS3FileSystem.S3_EXTERNAL_ID; -import static io.trino.hdfs.s3.TrinoS3FileSystem.S3_IAM_ROLE; -import static io.trino.hdfs.s3.TrinoS3FileSystem.S3_MAX_ERROR_RETRIES; -import static 
io.trino.hdfs.s3.TrinoS3FileSystem.S3_PIN_CLIENT_TO_CURRENT_REGION; -import static io.trino.hdfs.s3.TrinoS3FileSystem.S3_ROLE_SESSION_NAME; -import static io.trino.hdfs.s3.TrinoS3FileSystem.S3_SECRET_KEY; -import static io.trino.hdfs.s3.TrinoS3FileSystem.S3_SESSION_TOKEN; -import static io.trino.hdfs.s3.TrinoS3FileSystem.S3_SOCKET_TIMEOUT; -import static io.trino.hdfs.s3.TrinoS3FileSystem.S3_SSL_ENABLED; -import static io.trino.hdfs.s3.TrinoS3FileSystem.S3_STS_ENDPOINT; -import static io.trino.hdfs.s3.TrinoS3FileSystem.S3_STS_REGION; -import static io.trino.hdfs.s3.TrinoS3FileSystem.S3_USER_AGENT_PREFIX; -import static java.lang.Math.toIntExact; -import static java.lang.String.format; - -/** - * This factory provides AmazonS3 client required for executing S3SelectPushdown requests. - * Normal S3 GET requests use AmazonS3 clients initialized in {@link TrinoS3FileSystem} or EMRFS. - * The ideal state will be to merge this logic with the two file systems and get rid of this - * factory class. - * Please do not use the client provided by this factory for any other use cases. - */ -public class TrinoS3ClientFactory -{ - private static final Logger log = Logger.get(TrinoS3ClientFactory.class); - private static final String S3_SELECT_PUSHDOWN_MAX_CONNECTIONS = "hive.s3select-pushdown.max-connections"; - - private final boolean enabled; - private final int defaultMaxConnections; - - @GuardedBy("this") - private AmazonS3 s3Client; - - @Inject - public TrinoS3ClientFactory(HiveConfig config) - { - this.enabled = config.isS3SelectPushdownEnabled(); - this.defaultMaxConnections = config.getS3SelectPushdownMaxConnections(); - } - - synchronized AmazonS3 getS3Client(Configuration config) - { - if (s3Client == null) { - s3Client = createS3Client(config); - } - return s3Client; - } - - private AmazonS3 createS3Client(Configuration config) - { - HiveS3Config defaults = new HiveS3Config(); - String userAgentPrefix = config.get(S3_USER_AGENT_PREFIX, defaults.getS3UserAgentPrefix()); - int maxErrorRetries = config.getInt(S3_MAX_ERROR_RETRIES, defaults.getS3MaxErrorRetries()); - boolean sslEnabled = config.getBoolean(S3_SSL_ENABLED, defaults.isS3SslEnabled()); - Duration connectTimeout = Duration.valueOf(config.get(S3_CONNECT_TIMEOUT, defaults.getS3ConnectTimeout().toString())); - Duration socketTimeout = Duration.valueOf(config.get(S3_SOCKET_TIMEOUT, defaults.getS3SocketTimeout().toString())); - int maxConnections = config.getInt(S3_SELECT_PUSHDOWN_MAX_CONNECTIONS, defaultMaxConnections); - - ClientConfiguration clientConfiguration = new ClientConfiguration() - .withMaxErrorRetry(maxErrorRetries) - .withProtocol(sslEnabled ? Protocol.HTTPS : Protocol.HTTP) - .withConnectionTimeout(toIntExact(connectTimeout.toMillis())) - .withSocketTimeout(toIntExact(socketTimeout.toMillis())) - .withMaxConnections(maxConnections) - .withUserAgentPrefix(userAgentPrefix) - .withUserAgentSuffix(enabled ? "Trino-select" : "Trino"); - - String connectTtlValue = config.get(S3_CONNECT_TTL); - if (!isNullOrEmpty(connectTtlValue)) { - clientConfiguration.setConnectionTTL(Duration.valueOf(connectTtlValue).toMillis()); - } - - AWSCredentialsProvider awsCredentialsProvider = getAwsCredentialsProvider(config); - AmazonS3Builder, ? 
extends AmazonS3> clientBuilder = AmazonS3Client.builder() - .withCredentials(awsCredentialsProvider) - .withClientConfiguration(clientConfiguration) - .withMetricsCollector(TrinoS3FileSystem.getFileSystemStats().newRequestMetricCollector()) - .enablePathStyleAccess(); - - boolean regionOrEndpointSet = false; - - String endpoint = config.get(S3_ENDPOINT); - boolean pinS3ClientToCurrentRegion = config.getBoolean(S3_PIN_CLIENT_TO_CURRENT_REGION, defaults.isPinS3ClientToCurrentRegion()); - verify(!pinS3ClientToCurrentRegion || endpoint == null, - "Invalid configuration: either endpoint can be set or S3 client can be pinned to the current region"); - - // use local region when running inside of EC2 - if (pinS3ClientToCurrentRegion) { - clientBuilder.setRegion(getCurrentRegionFromEC2Metadata().getName()); - regionOrEndpointSet = true; - } - - if (!isNullOrEmpty(endpoint)) { - clientBuilder.withEndpointConfiguration(new EndpointConfiguration(endpoint, null)); - regionOrEndpointSet = true; - } - - if (!regionOrEndpointSet) { - clientBuilder.withRegion(US_EAST_1); - clientBuilder.setForceGlobalBucketAccessEnabled(true); - } - - return clientBuilder.build(); - } - - private static AWSCredentialsProvider getAwsCredentialsProvider(Configuration conf) - { - Optional credentials = getAwsCredentials(conf); - if (credentials.isPresent()) { - return new AWSStaticCredentialsProvider(credentials.get()); - } - - String providerClass = conf.get(S3_CREDENTIALS_PROVIDER); - if (!isNullOrEmpty(providerClass)) { - return getCustomAWSCredentialsProvider(conf, providerClass); - } - - AWSCredentialsProvider provider = getAwsCredentials(conf) - .map(value -> (AWSCredentialsProvider) new AWSStaticCredentialsProvider(value)) - .orElseGet(DefaultAWSCredentialsProviderChain::getInstance); - - String iamRole = conf.get(S3_IAM_ROLE); - if (iamRole != null) { - String stsEndpointOverride = conf.get(S3_STS_ENDPOINT); - String stsRegionOverride = conf.get(S3_STS_REGION); - String s3RoleSessionName = conf.get(S3_ROLE_SESSION_NAME); - String externalId = conf.get(S3_EXTERNAL_ID); - - AWSSecurityTokenServiceClientBuilder stsClientBuilder = AWSSecurityTokenServiceClientBuilder.standard() - .withCredentials(provider); - - String region; - if (!isNullOrEmpty(stsRegionOverride)) { - region = stsRegionOverride; - } - else { - DefaultAwsRegionProviderChain regionProviderChain = new DefaultAwsRegionProviderChain(); - try { - region = regionProviderChain.getRegion(); - } - catch (SdkClientException ex) { - log.warn("Falling back to default AWS region %s", US_EAST_1); - region = US_EAST_1.getName(); - } - } - - if (!isNullOrEmpty(stsEndpointOverride)) { - stsClientBuilder.withEndpointConfiguration(new EndpointConfiguration(stsEndpointOverride, region)); - } - else { - stsClientBuilder.withRegion(region); - } - - provider = new STSAssumeRoleSessionCredentialsProvider.Builder(iamRole, s3RoleSessionName) - .withExternalId(externalId) - .withStsClient(stsClientBuilder.build()) - .build(); - } - - return provider; - } - - private static AWSCredentialsProvider getCustomAWSCredentialsProvider(Configuration conf, String providerClass) - { - try { - return conf.getClassByName(providerClass) - .asSubclass(AWSCredentialsProvider.class) - .getConstructor(URI.class, Configuration.class) - .newInstance(null, conf); - } - catch (ReflectiveOperationException e) { - throw new RuntimeException(format("Error creating an instance of %s", providerClass), e); - } - } - - private static Optional getAwsCredentials(Configuration conf) - { - String accessKey = 
conf.get(S3_ACCESS_KEY); - String secretKey = conf.get(S3_SECRET_KEY); - - if (isNullOrEmpty(accessKey) || isNullOrEmpty(secretKey)) { - return Optional.empty(); - } - String sessionToken = conf.get(S3_SESSION_TOKEN); - if (!isNullOrEmpty(sessionToken)) { - return Optional.of(new BasicSessionCredentials(accessKey, secretKey, sessionToken)); - } - - return Optional.of(new BasicAWSCredentials(accessKey, secretKey)); - } -} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/TrinoS3SelectClient.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/TrinoS3SelectClient.java deleted file mode 100644 index e42777ac40f8c..0000000000000 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/TrinoS3SelectClient.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.trino.plugin.hive.s3select; - -import com.amazonaws.services.s3.AmazonS3; -import com.amazonaws.services.s3.model.SelectObjectContentEventVisitor; -import com.amazonaws.services.s3.model.SelectObjectContentRequest; -import com.amazonaws.services.s3.model.SelectObjectContentResult; -import org.apache.hadoop.conf.Configuration; - -import java.io.Closeable; -import java.io.IOException; -import java.io.InputStream; - -import static com.amazonaws.services.s3.model.SelectObjectContentEvent.EndEvent; -import static java.util.Objects.requireNonNull; - -class TrinoS3SelectClient - implements Closeable -{ - private final AmazonS3 s3Client; - private boolean requestComplete; - private SelectObjectContentRequest selectObjectRequest; - private SelectObjectContentResult selectObjectContentResult; - - public TrinoS3SelectClient(Configuration configuration, TrinoS3ClientFactory s3ClientFactory) - { - requireNonNull(configuration, "configuration is null"); - requireNonNull(s3ClientFactory, "s3ClientFactory is null"); - this.s3Client = s3ClientFactory.getS3Client(configuration); - } - - public InputStream getRecordsContent(SelectObjectContentRequest selectObjectRequest) - { - this.selectObjectRequest = requireNonNull(selectObjectRequest, "selectObjectRequest is null"); - this.selectObjectContentResult = s3Client.selectObjectContent(selectObjectRequest); - return selectObjectContentResult.getPayload() - .getRecordsInputStream( - new SelectObjectContentEventVisitor() - { - @Override - public void visit(EndEvent endEvent) - { - requestComplete = true; - } - }); - } - - @Override - public void close() - throws IOException - { - selectObjectContentResult.close(); - } - - public String getKeyName() - { - return selectObjectRequest.getKey(); - } - - public String getBucketName() - { - return selectObjectRequest.getBucketName(); - } - - /** - * The End Event indicates all matching records have been transmitted. - * If the End Event is not received, the results may be incomplete. 
- */ - public boolean isRequestComplete() - { - return requestComplete; - } -} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/csv/S3SelectCsvRecordReader.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/csv/S3SelectCsvRecordReader.java deleted file mode 100644 index 89dc6d3c91028..0000000000000 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/csv/S3SelectCsvRecordReader.java +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.trino.plugin.hive.s3select.csv; - -import com.amazonaws.services.s3.model.CSVInput; -import com.amazonaws.services.s3.model.CSVOutput; -import com.amazonaws.services.s3.model.CompressionType; -import com.amazonaws.services.s3.model.InputSerialization; -import com.amazonaws.services.s3.model.OutputSerialization; -import io.trino.plugin.hive.s3select.S3SelectLineRecordReader; -import io.trino.plugin.hive.s3select.TrinoS3ClientFactory; -import io.trino.plugin.hive.util.SerdeConstants; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; - -import java.util.Optional; -import java.util.Properties; - -import static io.trino.plugin.hive.util.SerdeConstants.ESCAPE_CHAR; -import static io.trino.plugin.hive.util.SerdeConstants.FIELD_DELIM; -import static io.trino.plugin.hive.util.SerdeConstants.QUOTE_CHAR; - -public class S3SelectCsvRecordReader - extends S3SelectLineRecordReader -{ - /* - * Sentinel unicode comment character (http://www.unicode.org/faq/private_use.html#nonchar_codes). - * It is expected that \uFDD0 sentinel comment character is not the first character in any row of user's CSV S3 object. - * The rows starting with \uFDD0 will be skipped by S3Select and will not be a part of the result set or aggregations. - * To process CSV objects that may contain \uFDD0 as first row character please disable S3SelectPushdown. - * TODO: Remove this proxy logic when S3Select API supports disabling of row level comments. 
- */ - - private static final String COMMENTS_CHAR_STR = "\uFDD0"; - private static final String DEFAULT_FIELD_DELIMITER = ","; - - public S3SelectCsvRecordReader( - Configuration configuration, - Path path, - long start, - long length, - Properties schema, - String ionSqlQuery, - TrinoS3ClientFactory s3ClientFactory) - { - super(configuration, path, start, length, schema, ionSqlQuery, s3ClientFactory); - } - - @Override - public InputSerialization buildInputSerialization() - { - Properties schema = getSchema(); - String fieldDelimiter = schema.getProperty(FIELD_DELIM, DEFAULT_FIELD_DELIMITER); - String quoteChar = schema.getProperty(QUOTE_CHAR, null); - String escapeChar = schema.getProperty(ESCAPE_CHAR, null); - - CSVInput selectObjectCSVInputSerialization = new CSVInput(); - selectObjectCSVInputSerialization.setRecordDelimiter(getLineDelimiter()); - selectObjectCSVInputSerialization.setFieldDelimiter(fieldDelimiter); - selectObjectCSVInputSerialization.setComments(COMMENTS_CHAR_STR); - selectObjectCSVInputSerialization.setQuoteCharacter(quoteChar); - selectObjectCSVInputSerialization.setQuoteEscapeCharacter(escapeChar); - - InputSerialization selectObjectInputSerialization = new InputSerialization(); - selectObjectInputSerialization.setCompressionType(getCompressionType()); - selectObjectInputSerialization.setCsv(selectObjectCSVInputSerialization); - - return selectObjectInputSerialization; - } - - @Override - public OutputSerialization buildOutputSerialization() - { - Properties schema = getSchema(); - String fieldDelimiter = schema.getProperty(FIELD_DELIM, DEFAULT_FIELD_DELIMITER); - String quoteChar = schema.getProperty(QUOTE_CHAR, null); - String escapeChar = schema.getProperty(ESCAPE_CHAR, null); - - OutputSerialization selectObjectOutputSerialization = new OutputSerialization(); - CSVOutput selectObjectCSVOutputSerialization = new CSVOutput(); - selectObjectCSVOutputSerialization.setRecordDelimiter(getLineDelimiter()); - selectObjectCSVOutputSerialization.setFieldDelimiter(fieldDelimiter); - selectObjectCSVOutputSerialization.setQuoteCharacter(quoteChar); - selectObjectCSVOutputSerialization.setQuoteEscapeCharacter(escapeChar); - selectObjectOutputSerialization.setCsv(selectObjectCSVOutputSerialization); - - return selectObjectOutputSerialization; - } - - @Override - public boolean shouldEnableScanRange() - { - // Works for CSV if AllowQuotedRecordDelimiter is disabled. - boolean isQuotedRecordDelimiterAllowed = Boolean.TRUE.equals( - buildInputSerialization().getCsv().getAllowQuotedRecordDelimiter()); - return CompressionType.NONE.equals(getCompressionType()) && !isQuotedRecordDelimiterAllowed; - } - - public static Optional nullCharacterEncoding(Properties schema) - { - return Optional.ofNullable(schema.getProperty(SerdeConstants.SERIALIZATION_NULL_FORMAT)); - } -} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/json/S3SelectJsonRecordReader.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/json/S3SelectJsonRecordReader.java deleted file mode 100644 index fa7d7be846540..0000000000000 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/json/S3SelectJsonRecordReader.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.trino.plugin.hive.s3select.json; - -import com.amazonaws.services.s3.model.CompressionType; -import com.amazonaws.services.s3.model.InputSerialization; -import com.amazonaws.services.s3.model.JSONInput; -import com.amazonaws.services.s3.model.JSONOutput; -import com.amazonaws.services.s3.model.JSONType; -import com.amazonaws.services.s3.model.OutputSerialization; -import io.trino.plugin.hive.s3select.S3SelectLineRecordReader; -import io.trino.plugin.hive.s3select.TrinoS3ClientFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; - -import java.util.Properties; - -public class S3SelectJsonRecordReader - extends S3SelectLineRecordReader -{ - public S3SelectJsonRecordReader(Configuration configuration, - Path path, - long start, - long length, - Properties schema, - String ionSqlQuery, - TrinoS3ClientFactory s3ClientFactory) - { - super(configuration, path, start, length, schema, ionSqlQuery, s3ClientFactory); - } - - @Override - public InputSerialization buildInputSerialization() - { - // JSONType.LINES is the only JSON format supported by the Hive JsonSerDe. - JSONInput selectObjectJSONInputSerialization = new JSONInput(); - selectObjectJSONInputSerialization.setType(JSONType.LINES); - - InputSerialization selectObjectInputSerialization = new InputSerialization(); - selectObjectInputSerialization.setCompressionType(getCompressionType()); - selectObjectInputSerialization.setJson(selectObjectJSONInputSerialization); - - return selectObjectInputSerialization; - } - - @Override - public OutputSerialization buildOutputSerialization() - { - OutputSerialization selectObjectOutputSerialization = new OutputSerialization(); - JSONOutput selectObjectJSONOutputSerialization = new JSONOutput(); - selectObjectOutputSerialization.setJson(selectObjectJSONOutputSerialization); - - return selectObjectOutputSerialization; - } - - @Override - public boolean shouldEnableScanRange() - { - return CompressionType.NONE.equals(getCompressionType()); - } -} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/FieldSetterFactory.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/FieldSetterFactory.java deleted file mode 100644 index fa5e799f603e1..0000000000000 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/FieldSetterFactory.java +++ /dev/null @@ -1,500 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package io.trino.plugin.hive.util; - -import com.google.common.collect.ImmutableList; -import io.trino.spi.block.Block; -import io.trino.spi.type.ArrayType; -import io.trino.spi.type.CharType; -import io.trino.spi.type.DecimalType; -import io.trino.spi.type.LongTimestamp; -import io.trino.spi.type.MapType; -import io.trino.spi.type.RowType; -import io.trino.spi.type.TimestampType; -import io.trino.spi.type.Type; -import io.trino.spi.type.VarcharType; -import org.apache.hadoop.hive.common.type.Timestamp; -import org.apache.hadoop.hive.serde2.io.DateWritableV2; -import org.apache.hadoop.hive.serde2.io.DoubleWritable; -import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; -import org.apache.hadoop.hive.serde2.io.ShortWritable; -import org.apache.hadoop.hive.serde2.io.TimestampWritableV2; -import org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructField; -import org.apache.hadoop.io.BooleanWritable; -import org.apache.hadoop.io.ByteWritable; -import org.apache.hadoop.io.BytesWritable; -import org.apache.hadoop.io.FloatWritable; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.joda.time.DateTimeZone; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import static io.trino.plugin.hive.util.HiveWriteUtils.getField; -import static io.trino.plugin.hive.util.HiveWriteUtils.getHiveDecimal; -import static io.trino.spi.type.BigintType.BIGINT; -import static io.trino.spi.type.BooleanType.BOOLEAN; -import static io.trino.spi.type.DateType.DATE; -import static io.trino.spi.type.DoubleType.DOUBLE; -import static io.trino.spi.type.IntegerType.INTEGER; -import static io.trino.spi.type.RealType.REAL; -import static io.trino.spi.type.SmallintType.SMALLINT; -import static io.trino.spi.type.Timestamps.MICROSECONDS_PER_SECOND; -import static io.trino.spi.type.Timestamps.MILLISECONDS_PER_SECOND; -import static io.trino.spi.type.Timestamps.PICOSECONDS_PER_MICROSECOND; -import static io.trino.spi.type.Timestamps.PICOSECONDS_PER_NANOSECOND; -import static io.trino.spi.type.TinyintType.TINYINT; -import static io.trino.spi.type.VarbinaryType.VARBINARY; -import static java.lang.Math.floorDiv; -import static java.lang.Math.floorMod; -import static java.lang.Math.toIntExact; -import static java.util.Objects.requireNonNull; - -public final class FieldSetterFactory -{ - private final DateTimeZone timeZone; - - public FieldSetterFactory(DateTimeZone timeZone) - { - this.timeZone = requireNonNull(timeZone, "timeZone is null"); - } - - public FieldSetter create(SettableStructObjectInspector rowInspector, Object row, StructField field, Type type) - { - if (BOOLEAN.equals(type)) { - return new BooleanFieldSetter(rowInspector, row, field); - } - if (BIGINT.equals(type)) { - return new BigintFieldSetter(rowInspector, row, field); - } - if (INTEGER.equals(type)) { - return new IntFieldSetter(rowInspector, row, field); - } - if (SMALLINT.equals(type)) { - return new SmallintFieldSetter(rowInspector, row, field); - } - if (TINYINT.equals(type)) { - return new TinyintFieldSetter(rowInspector, row, field); - } - if (REAL.equals(type)) { - return new FloatFieldSetter(rowInspector, row, field); - } - if (DOUBLE.equals(type)) { - return new DoubleFieldSetter(rowInspector, row, field); - } - if (type instanceof VarcharType) { - return new VarcharFieldSetter(rowInspector, row, field, type); - } - if (type 
instanceof CharType) { - return new CharFieldSetter(rowInspector, row, field, type); - } - if (VARBINARY.equals(type)) { - return new BinaryFieldSetter(rowInspector, row, field); - } - if (DATE.equals(type)) { - return new DateFieldSetter(rowInspector, row, field); - } - if (type instanceof TimestampType timestampType) { - return new TimestampFieldSetter(rowInspector, row, field, timestampType, timeZone); - } - if (type instanceof DecimalType decimalType) { - return new DecimalFieldSetter(rowInspector, row, field, decimalType); - } - if (type instanceof ArrayType arrayType) { - return new ArrayFieldSetter(rowInspector, row, field, arrayType.getElementType()); - } - if (type instanceof MapType mapType) { - return new MapFieldSetter(rowInspector, row, field, mapType.getKeyType(), mapType.getValueType()); - } - if (type instanceof RowType) { - return new RowFieldSetter(rowInspector, row, field, type.getTypeParameters()); - } - throw new IllegalArgumentException("unsupported type: " + type); - } - - public abstract static class FieldSetter - { - protected final SettableStructObjectInspector rowInspector; - protected final Object row; - protected final StructField field; - - private FieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field) - { - this.rowInspector = requireNonNull(rowInspector, "rowInspector is null"); - this.row = requireNonNull(row, "row is null"); - this.field = requireNonNull(field, "field is null"); - } - - public abstract void setField(Block block, int position); - } - - private static class BooleanFieldSetter - extends FieldSetter - { - private final BooleanWritable value = new BooleanWritable(); - - public BooleanFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field) - { - super(rowInspector, row, field); - } - - @Override - public void setField(Block block, int position) - { - value.set(BOOLEAN.getBoolean(block, position)); - rowInspector.setStructFieldData(row, field, value); - } - } - - private static class BigintFieldSetter - extends FieldSetter - { - private final LongWritable value = new LongWritable(); - - public BigintFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field) - { - super(rowInspector, row, field); - } - - @Override - public void setField(Block block, int position) - { - value.set(BIGINT.getLong(block, position)); - rowInspector.setStructFieldData(row, field, value); - } - } - - private static class IntFieldSetter - extends FieldSetter - { - private final IntWritable value = new IntWritable(); - - public IntFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field) - { - super(rowInspector, row, field); - } - - @Override - public void setField(Block block, int position) - { - value.set(INTEGER.getInt(block, position)); - rowInspector.setStructFieldData(row, field, value); - } - } - - private static class SmallintFieldSetter - extends FieldSetter - { - private final ShortWritable value = new ShortWritable(); - - public SmallintFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field) - { - super(rowInspector, row, field); - } - - @Override - public void setField(Block block, int position) - { - value.set(SMALLINT.getShort(block, position)); - rowInspector.setStructFieldData(row, field, value); - } - } - - private static class TinyintFieldSetter - extends FieldSetter - { - private final ByteWritable value = new ByteWritable(); - - public TinyintFieldSetter(SettableStructObjectInspector rowInspector, Object 
row, StructField field) - { - super(rowInspector, row, field); - } - - @Override - public void setField(Block block, int position) - { - value.set(TINYINT.getByte(block, position)); - rowInspector.setStructFieldData(row, field, value); - } - } - - private static class DoubleFieldSetter - extends FieldSetter - { - private final DoubleWritable value = new DoubleWritable(); - - public DoubleFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field) - { - super(rowInspector, row, field); - } - - @Override - public void setField(Block block, int position) - { - value.set(DOUBLE.getDouble(block, position)); - rowInspector.setStructFieldData(row, field, value); - } - } - - private static class FloatFieldSetter - extends FieldSetter - { - private final FloatWritable value = new FloatWritable(); - - public FloatFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field) - { - super(rowInspector, row, field); - } - - @Override - public void setField(Block block, int position) - { - value.set(REAL.getFloat(block, position)); - rowInspector.setStructFieldData(row, field, value); - } - } - - private static class VarcharFieldSetter - extends FieldSetter - { - private final Text value = new Text(); - private final Type type; - - public VarcharFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field, Type type) - { - super(rowInspector, row, field); - this.type = type; - } - - @Override - public void setField(Block block, int position) - { - value.set(type.getSlice(block, position).getBytes()); - rowInspector.setStructFieldData(row, field, value); - } - } - - private static class CharFieldSetter - extends FieldSetter - { - private final Text value = new Text(); - private final Type type; - - public CharFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field, Type type) - { - super(rowInspector, row, field); - this.type = type; - } - - @Override - public void setField(Block block, int position) - { - value.set(type.getSlice(block, position).getBytes()); - rowInspector.setStructFieldData(row, field, value); - } - } - - private static class BinaryFieldSetter - extends FieldSetter - { - private final BytesWritable value = new BytesWritable(); - - public BinaryFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field) - { - super(rowInspector, row, field); - } - - @Override - public void setField(Block block, int position) - { - byte[] bytes = VARBINARY.getSlice(block, position).getBytes(); - value.set(bytes, 0, bytes.length); - rowInspector.setStructFieldData(row, field, value); - } - } - - private static class DateFieldSetter - extends FieldSetter - { - private final DateWritableV2 value = new DateWritableV2(); - - public DateFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field) - { - super(rowInspector, row, field); - } - - @Override - public void setField(Block block, int position) - { - value.set(DATE.getInt(block, position)); - rowInspector.setStructFieldData(row, field, value); - } - } - - private static class TimestampFieldSetter - extends FieldSetter - { - private final DateTimeZone timeZone; - private final TimestampType type; - private final TimestampWritableV2 value = new TimestampWritableV2(); - - public TimestampFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field, TimestampType type, DateTimeZone timeZone) - { - super(rowInspector, row, field); - this.type = requireNonNull(type, "type is 
null"); - this.timeZone = requireNonNull(timeZone, "timeZone is null"); - } - - @Override - public void setField(Block block, int position) - { - long epochMicros; - int picosOfMicro; - if (type.isShort()) { - epochMicros = type.getLong(block, position); - picosOfMicro = 0; - } - else { - LongTimestamp longTimestamp = (LongTimestamp) type.getObject(block, position); - epochMicros = longTimestamp.getEpochMicros(); - picosOfMicro = longTimestamp.getPicosOfMicro(); - } - - long epochSeconds = floorDiv(epochMicros, MICROSECONDS_PER_SECOND); - long picosOfSecond = (long) floorMod(epochMicros, MICROSECONDS_PER_SECOND) * PICOSECONDS_PER_MICROSECOND + picosOfMicro; - - epochSeconds = convertLocalEpochSecondsToUtc(epochSeconds); - // no rounding since the data has nanosecond precision, at most - int nanosOfSecond = toIntExact(picosOfSecond / PICOSECONDS_PER_NANOSECOND); - - Timestamp timestamp = Timestamp.ofEpochSecond(epochSeconds, nanosOfSecond); - value.set(timestamp); - rowInspector.setStructFieldData(row, field, value); - } - - private long convertLocalEpochSecondsToUtc(long epochSeconds) - { - long epochMillis = epochSeconds * MILLISECONDS_PER_SECOND; - epochMillis = timeZone.convertLocalToUTC(epochMillis, false); - return epochMillis / MILLISECONDS_PER_SECOND; - } - } - - private static class DecimalFieldSetter - extends FieldSetter - { - private final HiveDecimalWritable value = new HiveDecimalWritable(); - private final DecimalType decimalType; - - public DecimalFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field, DecimalType decimalType) - { - super(rowInspector, row, field); - this.decimalType = decimalType; - } - - @Override - public void setField(Block block, int position) - { - value.set(getHiveDecimal(decimalType, block, position)); - rowInspector.setStructFieldData(row, field, value); - } - } - - private class ArrayFieldSetter - extends FieldSetter - { - private final Type elementType; - - public ArrayFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field, Type elementType) - { - super(rowInspector, row, field); - this.elementType = requireNonNull(elementType, "elementType is null"); - } - - @Override - public void setField(Block block, int position) - { - Block arrayBlock = block.getObject(position, Block.class); - - List list = new ArrayList<>(arrayBlock.getPositionCount()); - for (int i = 0; i < arrayBlock.getPositionCount(); i++) { - list.add(getField(timeZone, elementType, arrayBlock, i)); - } - - rowInspector.setStructFieldData(row, field, list); - } - } - - private class MapFieldSetter - extends FieldSetter - { - private final Type keyType; - private final Type valueType; - - public MapFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field, Type keyType, Type valueType) - { - super(rowInspector, row, field); - this.keyType = requireNonNull(keyType, "keyType is null"); - this.valueType = requireNonNull(valueType, "valueType is null"); - } - - @Override - public void setField(Block block, int position) - { - Block mapBlock = block.getObject(position, Block.class); - Map map = new HashMap<>(mapBlock.getPositionCount() * 2); - for (int i = 0; i < mapBlock.getPositionCount(); i += 2) { - map.put( - getField(timeZone, keyType, mapBlock, i), - getField(timeZone, valueType, mapBlock, i + 1)); - } - - rowInspector.setStructFieldData(row, field, map); - } - } - - private class RowFieldSetter - extends FieldSetter - { - private final List fieldTypes; - - public 
RowFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field, List<Type> fieldTypes)
-        {
-            super(rowInspector, row, field);
-            this.fieldTypes = ImmutableList.copyOf(fieldTypes);
-        }
-
-        @Override
-        public void setField(Block block, int position)
-        {
-            Block rowBlock = block.getObject(position, Block.class);
-
-            // TODO reuse row object and use FieldSetters, like we do at the top level
-            // Ideally, we'd use the same recursive structure starting from the top, but
-            // this requires modeling row types in the same way we model table rows
-            // (multiple blocks vs all fields packed in a single block)
-            List<Object> value = new ArrayList<>(fieldTypes.size());
-            for (int i = 0; i < fieldTypes.size(); i++) {
-                value.add(getField(timeZone, fieldTypes.get(i), rowBlock, i));
-            }
-
-            rowInspector.setStructFieldData(row, field, value);
-        }
-    }
-}
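The FieldSetterFactory removed above binds one mutable Hadoop Writable to each output column and overwrites it for every row, rather than allocating a fresh value object per cell. A minimal sketch of that reuse pattern follows; the class and method names are illustrative only and are not part of the removed code.

```java
import org.apache.hadoop.io.LongWritable;

// Illustrative only: one reusable Writable per column, reset for each row.
final class ReusableLongFieldSetter
{
    private final LongWritable value = new LongWritable();

    // Overwrites and returns the shared Writable; the caller must hand it to the
    // ObjectInspector (or copy it) before the next row overwrites it.
    LongWritable set(long rowValue)
    {
        value.set(rowValue);
        return value;
    }
}
```

The trade-off is the same as in the deleted code: the Writable is shared, so each value has to be consumed before the next row is written.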
diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/FooterAwareRecordReader.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/FooterAwareRecordReader.java
deleted file mode 100644
index a118e80da243e..0000000000000
--- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/FooterAwareRecordReader.java
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package io.trino.plugin.hive.util;
-
-import org.apache.hadoop.hive.ql.exec.FooterBuffer;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.io.WritableComparable;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.RecordReader;
-
-import java.io.IOException;
-
-import static com.google.common.base.Preconditions.checkArgument;
-import static java.util.Objects.requireNonNull;
-
-public class FooterAwareRecordReader<K extends WritableComparable<?>, V extends Writable>
-        implements RecordReader<K, V>
-{
-    private final RecordReader<K, V> delegate;
-    private final JobConf job;
-    private final FooterBuffer footerBuffer = new FooterBuffer();
-
-    public FooterAwareRecordReader(RecordReader<K, V> delegate, int footerCount, JobConf job)
-            throws IOException
-    {
-        this.delegate = requireNonNull(delegate, "delegate is null");
-        this.job = requireNonNull(job, "job is null");
-
-        checkArgument(footerCount > 0, "footerCount is expected to be positive");
-
-        footerBuffer.initializeBuffer(job, delegate, footerCount, delegate.createKey(), delegate.createValue());
-    }
-
-    @Override
-    public boolean next(K key, V value)
-            throws IOException
-    {
-        return footerBuffer.updateBuffer(job, delegate, key, value);
-    }
-
-    @Override
-    public K createKey()
-    {
-        return delegate.createKey();
-    }
-
-    @Override
-    public V createValue()
-    {
-        return delegate.createValue();
-    }
-
-    @Override
-    public long getPos()
-            throws IOException
-    {
-        return delegate.getPos();
-    }
-
-    @Override
-    public void close()
-            throws IOException
-    {
-        delegate.close();
-    }
-
-    @Override
-    public float getProgress()
-            throws IOException
-    {
-        return delegate.getProgress();
-    }
-}
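The FooterAwareRecordReader deleted above relies on Hive's FooterBuffer to hold back the last footerCount records, so a file's footer rows are never surfaced to the caller. A rough sketch of the same buffering idea over a plain Iterator is shown below; the names are hypothetical and no Hadoop types are involved.

```java
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.Iterator;
import java.util.List;

// Illustrative only: a row is emitted only once 'footerCount' newer rows have been read,
// so the trailing 'footerCount' rows (the footer) are silently dropped.
final class FooterSkippingIterator<T>
        implements Iterator<T>
{
    private final Iterator<T> delegate;
    private final Deque<T> buffer = new ArrayDeque<>();

    FooterSkippingIterator(Iterator<T> delegate, int footerCount)
    {
        this.delegate = delegate;
        // pre-fill the buffer with up to 'footerCount' rows
        while (buffer.size() < footerCount && delegate.hasNext()) {
            buffer.addLast(delegate.next());
        }
    }

    @Override
    public boolean hasNext()
    {
        // a buffered row may be emitted only if a newer row exists to take its place
        return delegate.hasNext();
    }

    @Override
    public T next()
    {
        buffer.addLast(delegate.next());
        return buffer.removeFirst();
    }

    public static void main(String[] args)
    {
        Iterator<String> rows = List.of("r1", "r2", "r3", "footer1", "footer2").iterator();
        new FooterSkippingIterator<>(rows, 2).forEachRemaining(System.out::println); // prints r1, r2, r3
    }
}
```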
diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/ForwardingRecordCursor.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/ForwardingRecordCursor.java
deleted file mode 100644
index 3f1a410cd3f56..0000000000000
--- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/ForwardingRecordCursor.java
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package io.trino.plugin.hive.util;
-
-import io.airlift.slice.Slice;
-import io.trino.spi.connector.RecordCursor;
-import io.trino.spi.type.Type;
-
-public abstract class ForwardingRecordCursor
-        implements RecordCursor
-{
-    protected abstract RecordCursor delegate();
-
-    @Override
-    public long getCompletedBytes()
-    {
-        return delegate().getCompletedBytes();
-    }
-
-    @Override
-    public long getReadTimeNanos()
-    {
-        return delegate().getReadTimeNanos();
-    }
-
-    @Override
-    public Type getType(int field)
-    {
-        return delegate().getType(field);
-    }
-
-    @Override
-    public boolean advanceNextPosition()
-    {
-        return delegate().advanceNextPosition();
-    }
-
-    @Override
-    public boolean getBoolean(int field)
-    {
-        return delegate().getBoolean(field);
-    }
-
-    @Override
-    public long getLong(int field)
-    {
-        return delegate().getLong(field);
-    }
-
-    @Override
-    public double getDouble(int field)
-    {
-        return delegate().getDouble(field);
-    }
-
-    @Override
-    public Slice getSlice(int field)
-    {
-        return delegate().getSlice(field);
-    }
-
-    @Override
-    public Object getObject(int field)
-    {
-        return delegate().getObject(field);
-    }
-
-    @Override
-    public boolean isNull(int field)
-    {
-        return delegate().isNull(field);
-    }
-
-    @Override
-    public long getMemoryUsage()
-    {
-        return delegate().getMemoryUsage();
-    }
-
-    @Override
-    public void close()
-    {
-        delegate().close();
-    }
-}
diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveReaderUtil.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveReaderUtil.java
deleted file mode 100644
index 80d82ead8531d..0000000000000
--- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveReaderUtil.java
+++ /dev/null
@@ -1,287 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ -package io.trino.plugin.hive.util; - -import com.google.common.base.Joiner; -import com.google.common.base.Splitter; -import io.airlift.compress.lzo.LzoCodec; -import io.airlift.compress.lzo.LzopCodec; -import io.trino.hadoop.TextLineLengthLimitExceededException; -import io.trino.plugin.hive.HiveColumnHandle; -import io.trino.plugin.hive.HiveStorageFormat; -import io.trino.plugin.hive.avro.TrinoAvroSerDe; -import io.trino.spi.TrinoException; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.serde2.AbstractSerDe; -import org.apache.hadoop.hive.serde2.Deserializer; -import org.apache.hadoop.hive.serde2.SerDeException; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.io.WritableComparable; -import org.apache.hadoop.mapred.FileSplit; -import org.apache.hadoop.mapred.InputFormat; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.RecordReader; -import org.apache.hadoop.mapred.Reporter; -import org.apache.hadoop.util.ReflectionUtils; - -import java.io.IOException; -import java.util.List; -import java.util.Properties; - -import static com.google.common.base.MoreObjects.firstNonNull; -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.collect.ImmutableList.toImmutableList; -import static com.google.common.collect.Lists.newArrayList; -import static io.trino.hdfs.ConfigurationUtils.copy; -import static io.trino.hdfs.ConfigurationUtils.toJobConf; -import static io.trino.hive.thrift.metastore.hive_metastoreConstants.FILE_INPUT_FORMAT; -import static io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; -import static io.trino.plugin.hive.HiveErrorCode.HIVE_BAD_DATA; -import static io.trino.plugin.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT; -import static io.trino.plugin.hive.HiveErrorCode.HIVE_INVALID_METADATA; -import static io.trino.plugin.hive.HiveErrorCode.HIVE_SERDE_NOT_FOUND; -import static io.trino.plugin.hive.HiveErrorCode.HIVE_UNSUPPORTED_FORMAT; -import static io.trino.plugin.hive.HiveStorageFormat.TEXTFILE; -import static io.trino.plugin.hive.util.HiveClassNames.AVRO_SERDE_CLASS; -import static io.trino.plugin.hive.util.HiveClassNames.LAZY_SIMPLE_SERDE_CLASS; -import static io.trino.plugin.hive.util.HiveClassNames.MAPRED_PARQUET_INPUT_FORMAT_CLASS; -import static io.trino.plugin.hive.util.HiveClassNames.SYMLINK_TEXT_INPUT_FORMAT_CLASS; -import static io.trino.plugin.hive.util.SerdeConstants.COLLECTION_DELIM; -import static org.apache.hadoop.hive.serde2.ColumnProjectionUtils.READ_ALL_COLUMNS; -import static org.apache.hadoop.hive.serde2.ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR; - -public final class HiveReaderUtil -{ - private HiveReaderUtil() {} - - public static RecordReader createRecordReader(Configuration configuration, Path path, long start, long length, Properties schema, List columns) - { - // determine which hive columns we will read - List readColumns = columns.stream() - .filter(column -> column.getColumnType() == REGULAR) - .collect(toImmutableList()); - - // Projected columns are not supported here - readColumns.forEach(readColumn -> checkArgument(readColumn.isBaseColumn(), "column %s is not a base column", readColumn.getName())); - - List readHiveColumnIndexes = readColumns.stream() - .map(HiveColumnHandle::getBaseHiveColumnIndex) - .collect(toImmutableList()); - - // 
Tell hive the columns we would like to read, this lets hive optimize reading column oriented files - configuration = copy(configuration); - setReadColumns(configuration, readHiveColumnIndexes); - - InputFormat inputFormat = getInputFormat(configuration, schema); - JobConf jobConf = toJobConf(configuration); - FileSplit fileSplit = new FileSplit(path, start, length, (String[]) null); - - // propagate serialization configuration to getRecordReader - schema.stringPropertyNames().stream() - .filter(name -> name.startsWith("serialization.")) - .forEach(name -> jobConf.set(name, schema.getProperty(name))); - - configureCompressionCodecs(jobConf); - - try { - @SuppressWarnings("unchecked") - RecordReader, ? extends Writable> recordReader = (RecordReader, ? extends Writable>) - inputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL); - - int headerCount = HiveUtil.getHeaderCount(schema); - // Only skip header rows when the split is at the beginning of the file - if (start == 0 && headerCount > 0) { - skipHeader(recordReader, headerCount); - } - - int footerCount = HiveUtil.getFooterCount(schema); - if (footerCount > 0) { - recordReader = new FooterAwareRecordReader<>(recordReader, footerCount, jobConf); - } - - return recordReader; - } - catch (IOException e) { - if (e instanceof TextLineLengthLimitExceededException) { - throw new TrinoException(HIVE_BAD_DATA, "Line too long in text file: " + path, e); - } - - throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, String.format("Error opening Hive split %s (offset=%s, length=%s) using %s: %s", - path, - start, - length, - HiveUtil.getInputFormatName(schema).orElse(null), - firstNonNull(e.getMessage(), e.getClass().getName())), - e); - } - } - - private static void skipHeader(RecordReader reader, int headerCount) - throws IOException - { - K key = reader.createKey(); - V value = reader.createValue(); - - while (headerCount > 0) { - if (!reader.next(key, value)) { - return; - } - headerCount--; - } - } - - private static void setReadColumns(Configuration configuration, List readHiveColumnIndexes) - { - configuration.set(READ_COLUMN_IDS_CONF_STR, Joiner.on(',').join(readHiveColumnIndexes)); - configuration.setBoolean(READ_ALL_COLUMNS, false); - } - - private static void configureCompressionCodecs(JobConf jobConf) - { - // add Airlift LZO and LZOP to head of codecs list to not override existing entries - List codecs = newArrayList(Splitter.on(",").trimResults().omitEmptyStrings().split(jobConf.get("io.compression.codecs", ""))); - if (!codecs.contains(LzoCodec.class.getName())) { - codecs.add(0, LzoCodec.class.getName()); - } - if (!codecs.contains(LzopCodec.class.getName())) { - codecs.add(0, LzopCodec.class.getName()); - } - jobConf.set("io.compression.codecs", String.join(",", codecs)); - } - - public static InputFormat getInputFormat(Configuration configuration, Properties schema) - { - String inputFormatName = HiveUtil.getInputFormatName(schema).orElseThrow(() -> - new TrinoException(HIVE_INVALID_METADATA, "Table or partition is missing Hive input format property: " + FILE_INPUT_FORMAT)); - try { - JobConf jobConf = toJobConf(configuration); - configureCompressionCodecs(jobConf); - - Class> inputFormatClass = getInputFormatClass(jobConf, inputFormatName); - if (inputFormatClass.getName().equals(SYMLINK_TEXT_INPUT_FORMAT_CLASS)) { - String serde = HiveUtil.getDeserializerClassName(schema); - // LazySimpleSerDe is used by TEXTFILE and SEQUENCEFILE. 
Default to TEXTFILE - // per Hive spec (https://hive.apache.org/javadocs/r2.1.1/api/org/apache/hadoop/hive/ql/io/SymlinkTextInputFormat.html) - if (serde.equals(TEXTFILE.getSerde())) { - inputFormatClass = getInputFormatClass(jobConf, TEXTFILE.getInputFormat()); - return ReflectionUtils.newInstance(inputFormatClass, jobConf); - } - for (HiveStorageFormat format : HiveStorageFormat.values()) { - if (serde.equals(format.getSerde())) { - inputFormatClass = getInputFormatClass(jobConf, format.getInputFormat()); - return ReflectionUtils.newInstance(inputFormatClass, jobConf); - } - } - throw new TrinoException(HIVE_UNSUPPORTED_FORMAT, "Unknown SerDe for SymlinkTextInputFormat: " + serde); - } - - return ReflectionUtils.newInstance(inputFormatClass, jobConf); - } - catch (ClassNotFoundException | RuntimeException e) { - throw new TrinoException(HIVE_UNSUPPORTED_FORMAT, "Unable to create input format " + inputFormatName, e); - } - } - - @SuppressWarnings("unchecked") - private static Class> getInputFormatClass(JobConf conf, String inputFormatName) - throws ClassNotFoundException - { - // legacy names for Parquet - if ("parquet.hive.DeprecatedParquetInputFormat".equals(inputFormatName) || - "parquet.hive.MapredParquetInputFormat".equals(inputFormatName)) { - inputFormatName = MAPRED_PARQUET_INPUT_FORMAT_CLASS; - } - - Class clazz = conf.getClassByName(inputFormatName); - return (Class>) clazz.asSubclass(InputFormat.class); - } - - public static StructObjectInspector getTableObjectInspector(Deserializer deserializer) - { - try { - ObjectInspector inspector = deserializer.getObjectInspector(); - checkArgument(inspector.getCategory() == ObjectInspector.Category.STRUCT, "expected STRUCT: %s", inspector.getCategory()); - return (StructObjectInspector) inspector; - } - catch (SerDeException e) { - throw new RuntimeException(e); - } - } - - public static Deserializer getDeserializer(Configuration configuration, Properties schema) - { - String name = HiveUtil.getDeserializerClassName(schema); - - // for collection delimiter, Hive 1.x, 2.x uses "colelction.delim" but Hive 3.x uses "collection.delim" - // see also https://issues.apache.org/jira/browse/HIVE-16922 - if (name.equals(LAZY_SIMPLE_SERDE_CLASS)) { - if (schema.containsKey("colelction.delim") && !schema.containsKey(COLLECTION_DELIM)) { - schema.setProperty(COLLECTION_DELIM, schema.getProperty("colelction.delim")); - } - } - - Deserializer deserializer = createDeserializer(getDeserializerClass(name)); - initializeDeserializer(configuration, deserializer, schema); - return deserializer; - } - - private static Class getDeserializerClass(String name) - { - if (AVRO_SERDE_CLASS.equals(name)) { - return TrinoAvroSerDe.class; - } - - try { - return Class.forName(name).asSubclass(Deserializer.class); - } - catch (ClassNotFoundException e) { - throw new TrinoException(HIVE_SERDE_NOT_FOUND, "deserializer does not exist: " + name); - } - catch (ClassCastException e) { - throw new RuntimeException("invalid deserializer class: " + name); - } - } - - private static Deserializer createDeserializer(Class clazz) - { - try { - return clazz.getConstructor().newInstance(); - } - catch (ReflectiveOperationException e) { - throw new RuntimeException("error creating deserializer: " + clazz.getName(), e); - } - } - - private static void initializeDeserializer(Configuration configuration, Deserializer deserializer, Properties schema) - { - try { - configuration = copy(configuration); // Some SerDes (e.g. 
Avro) modify passed configuration - deserializer.initialize(configuration, schema); - validate(deserializer); - } - catch (SerDeException | RuntimeException e) { - throw new RuntimeException("error initializing deserializer: " + deserializer.getClass().getName(), e); - } - } - - private static void validate(Deserializer deserializer) - { - if (deserializer instanceof AbstractSerDe && !((AbstractSerDe) deserializer).getConfigurationErrors().isEmpty()) { - throw new RuntimeException("There are configuration errors: " + ((AbstractSerDe) deserializer).getConfigurationErrors()); - } - } -} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveWriteUtils.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveWriteUtils.java index c05d7091470b8..4ba8bb01e4d59 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveWriteUtils.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveWriteUtils.java @@ -21,16 +21,13 @@ import io.trino.hdfs.rubix.CachingTrinoS3FileSystem; import io.trino.hdfs.s3.TrinoS3FileSystem; import io.trino.plugin.hive.HiveReadOnlyException; -import io.trino.plugin.hive.HiveTimestampPrecision; import io.trino.plugin.hive.HiveType; -import io.trino.plugin.hive.avro.AvroRecordWriter; import io.trino.plugin.hive.metastore.Database; import io.trino.plugin.hive.metastore.Partition; import io.trino.plugin.hive.metastore.ProtectMode; import io.trino.plugin.hive.metastore.SemiTransactionalHiveMetastore; import io.trino.plugin.hive.metastore.Storage; import io.trino.plugin.hive.metastore.Table; -import io.trino.plugin.hive.parquet.ParquetRecordWriter; import io.trino.plugin.hive.type.ListTypeInfo; import io.trino.plugin.hive.type.MapTypeInfo; import io.trino.plugin.hive.type.PrimitiveCategory; @@ -38,350 +35,143 @@ import io.trino.plugin.hive.type.StructTypeInfo; import io.trino.plugin.hive.type.TypeInfo; import io.trino.spi.Page; -import io.trino.spi.StandardErrorCode; import io.trino.spi.TrinoException; import io.trino.spi.block.Block; -import io.trino.spi.connector.ConnectorSession; import io.trino.spi.connector.SchemaNotFoundException; import io.trino.spi.connector.SchemaTableName; -import io.trino.spi.type.ArrayType; import io.trino.spi.type.CharType; import io.trino.spi.type.DecimalType; -import io.trino.spi.type.Int128; -import io.trino.spi.type.LongTimestamp; -import io.trino.spi.type.MapType; -import io.trino.spi.type.RowType; -import io.trino.spi.type.TimestampType; -import io.trino.spi.type.TimestampWithTimeZoneType; import io.trino.spi.type.Type; import io.trino.spi.type.VarcharType; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.viewfs.ViewFileSystem; import org.apache.hadoop.hdfs.DistributedFileSystem; -import org.apache.hadoop.hive.common.type.Date; -import org.apache.hadoop.hive.common.type.HiveDecimal; -import org.apache.hadoop.hive.common.type.Timestamp; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter; -import org.apache.hadoop.hive.ql.io.HiveOutputFormat; -import org.apache.hadoop.hive.serde2.SerDeException; -import org.apache.hadoop.hive.serde2.Serializer; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; -import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; -import 
org.apache.hadoop.io.Text; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.Reporter; -import org.joda.time.DateTimeZone; import java.io.FileNotFoundException; import java.io.IOException; -import java.math.BigInteger; -import java.util.ArrayList; -import java.util.HashMap; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatterBuilder; +import java.time.temporal.ChronoField; import java.util.List; import java.util.Map; import java.util.Optional; -import java.util.Properties; -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Verify.verify; -import static com.google.common.collect.ImmutableList.toImmutableList; import static com.google.common.io.BaseEncoding.base16; import static io.trino.hdfs.FileSystemUtils.getRawFileSystem; import static io.trino.hdfs.s3.HiveS3Module.EMR_FS_CLASS_NAME; import static io.trino.plugin.hive.HiveErrorCode.HIVE_DATABASE_LOCATION_ERROR; import static io.trino.plugin.hive.HiveErrorCode.HIVE_FILESYSTEM_ERROR; import static io.trino.plugin.hive.HiveErrorCode.HIVE_INVALID_PARTITION_VALUE; -import static io.trino.plugin.hive.HiveErrorCode.HIVE_SERDE_NOT_FOUND; -import static io.trino.plugin.hive.HiveErrorCode.HIVE_WRITER_DATA_ERROR; import static io.trino.plugin.hive.HivePartitionKey.HIVE_DEFAULT_DYNAMIC_PARTITION; import static io.trino.plugin.hive.TableType.MANAGED_TABLE; import static io.trino.plugin.hive.TableType.MATERIALIZED_VIEW; import static io.trino.plugin.hive.metastore.MetastoreUtil.getProtectMode; import static io.trino.plugin.hive.metastore.MetastoreUtil.verifyOnline; -import static io.trino.plugin.hive.type.VarcharTypeInfo.MAX_VARCHAR_LENGTH; -import static io.trino.plugin.hive.util.HiveClassNames.AVRO_CONTAINER_OUTPUT_FORMAT_CLASS; -import static io.trino.plugin.hive.util.HiveClassNames.HIVE_IGNORE_KEY_OUTPUT_FORMAT_CLASS; -import static io.trino.plugin.hive.util.HiveClassNames.HIVE_SEQUENCEFILE_OUTPUT_FORMAT_CLASS; -import static io.trino.plugin.hive.util.HiveClassNames.MAPRED_PARQUET_OUTPUT_FORMAT_CLASS; -import static io.trino.plugin.hive.util.HiveUtil.checkCondition; import static io.trino.plugin.hive.util.HiveUtil.escapeTableName; -import static io.trino.plugin.hive.util.HiveUtil.isStructuralType; import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED; import static io.trino.spi.type.BigintType.BIGINT; import static io.trino.spi.type.BooleanType.BOOLEAN; import static io.trino.spi.type.Chars.padSpaces; import static io.trino.spi.type.DateTimeEncoding.unpackMillisUtc; import static io.trino.spi.type.DateType.DATE; +import static io.trino.spi.type.Decimals.readBigDecimal; import static io.trino.spi.type.DoubleType.DOUBLE; import static io.trino.spi.type.IntegerType.INTEGER; import static io.trino.spi.type.RealType.REAL; import static io.trino.spi.type.SmallintType.SMALLINT; +import static io.trino.spi.type.TimestampType.TIMESTAMP_MILLIS; import static io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_MILLIS; -import static io.trino.spi.type.Timestamps.MICROSECONDS_PER_MILLISECOND; import static io.trino.spi.type.Timestamps.MICROSECONDS_PER_SECOND; -import static io.trino.spi.type.Timestamps.MILLISECONDS_PER_SECOND; import static io.trino.spi.type.Timestamps.NANOSECONDS_PER_MICROSECOND; -import static io.trino.spi.type.Timestamps.PICOSECONDS_PER_NANOSECOND; import static io.trino.spi.type.TinyintType.TINYINT; 
-import static io.trino.spi.type.VarbinaryType.VARBINARY; import static java.lang.Math.floorDiv; import static java.lang.Math.floorMod; import static java.lang.String.format; import static java.nio.charset.StandardCharsets.UTF_8; -import static java.util.Collections.unmodifiableList; -import static java.util.Collections.unmodifiableMap; import static java.util.UUID.randomUUID; -import static java.util.stream.Collectors.toList; -import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.COMPRESSRESULT; -import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector; -import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector; -import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaBooleanObjectInspector; -import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaByteArrayObjectInspector; -import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaByteObjectInspector; -import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaDateObjectInspector; -import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaDoubleObjectInspector; -import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaFloatObjectInspector; -import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaIntObjectInspector; -import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaLongObjectInspector; -import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaShortObjectInspector; -import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaTimestampObjectInspector; -import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableBinaryObjectInspector; -import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableBooleanObjectInspector; -import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableByteObjectInspector; -import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableDateObjectInspector; -import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableDoubleObjectInspector; -import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableFloatObjectInspector; -import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableHiveCharObjectInspector; -import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableIntObjectInspector; -import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableLongObjectInspector; -import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableShortObjectInspector; -import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableStringObjectInspector; -import 
static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableTimestampObjectInspector; -import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.getCharTypeInfo; -import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.getVarcharTypeInfo; public final class HiveWriteUtils { - private HiveWriteUtils() - { - } - - public static RecordWriter createRecordWriter(Path target, JobConf conf, Properties properties, String outputFormatName, ConnectorSession session) - { - return createRecordWriter(target, conf, properties, outputFormatName, session, Optional.empty()); - } - - public static RecordWriter createRecordWriter(Path target, JobConf conf, Properties properties, String outputFormatName, ConnectorSession session, Optional textHeaderWriter) - { - try { - boolean compress = HiveConf.getBoolVar(conf, COMPRESSRESULT); - if (outputFormatName.equals(MAPRED_PARQUET_OUTPUT_FORMAT_CLASS)) { - return ParquetRecordWriter.create(target, conf, properties, session); - } - if (outputFormatName.equals(HIVE_IGNORE_KEY_OUTPUT_FORMAT_CLASS)) { - return new TextRecordWriter(target, conf, properties, compress, textHeaderWriter); - } - if (outputFormatName.equals(HIVE_SEQUENCEFILE_OUTPUT_FORMAT_CLASS)) { - return new SequenceFileRecordWriter(target, conf, Text.class, compress); - } - if (outputFormatName.equals(AVRO_CONTAINER_OUTPUT_FORMAT_CLASS)) { - return new AvroRecordWriter(target, conf, compress, properties); - } - Object writer = Class.forName(outputFormatName).getConstructor().newInstance(); - return ((HiveOutputFormat) writer).getHiveRecordWriter(conf, target, Text.class, compress, properties, Reporter.NULL); - } - catch (IOException | ReflectiveOperationException e) { - throw new TrinoException(HIVE_WRITER_DATA_ERROR, e); - } - } + private static final DateTimeFormatter HIVE_DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd"); + private static final DateTimeFormatter HIVE_TIMESTAMP_FORMATTER = new DateTimeFormatterBuilder() + .append(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")) + .optionalStart().appendFraction(ChronoField.NANO_OF_SECOND, 0, 9, true).optionalEnd() + .toFormatter(); - public static Serializer initializeSerializer(Configuration conf, Properties properties, String serializerName) - { - try { - Serializer result = (Serializer) Class.forName(serializerName).getConstructor().newInstance(); - result.initialize(conf, properties); - return result; - } - catch (ClassNotFoundException e) { - throw new TrinoException(HIVE_SERDE_NOT_FOUND, "Serializer does not exist: " + serializerName); - } - catch (SerDeException | ReflectiveOperationException e) { - throw new TrinoException(HIVE_WRITER_DATA_ERROR, e); - } - } - - public static ObjectInspector getJavaObjectInspector(Type type) + private HiveWriteUtils() { - if (type.equals(BOOLEAN)) { - return javaBooleanObjectInspector; - } - if (type.equals(BIGINT)) { - return javaLongObjectInspector; - } - if (type.equals(INTEGER)) { - return javaIntObjectInspector; - } - if (type.equals(SMALLINT)) { - return javaShortObjectInspector; - } - if (type.equals(TINYINT)) { - return javaByteObjectInspector; - } - if (type.equals(REAL)) { - return javaFloatObjectInspector; - } - if (type.equals(DOUBLE)) { - return javaDoubleObjectInspector; - } - if (type instanceof VarcharType) { - return writableStringObjectInspector; - } - if (type instanceof CharType) { - return writableHiveCharObjectInspector; - } - if (type.equals(VARBINARY)) { - return javaByteArrayObjectInspector; - } - if 
(type.equals(DATE)) { - return javaDateObjectInspector; - } - if (type instanceof TimestampType) { - return javaTimestampObjectInspector; - } - if (type instanceof DecimalType decimalType) { - return getPrimitiveJavaObjectInspector(new DecimalTypeInfo(decimalType.getPrecision(), decimalType.getScale())); - } - if (type instanceof ArrayType arrayType) { - return ObjectInspectorFactory.getStandardListObjectInspector(getJavaObjectInspector(arrayType.getElementType())); - } - if (type instanceof MapType mapType) { - ObjectInspector keyObjectInspector = getJavaObjectInspector(mapType.getKeyType()); - ObjectInspector valueObjectInspector = getJavaObjectInspector(mapType.getValueType()); - return ObjectInspectorFactory.getStandardMapObjectInspector(keyObjectInspector, valueObjectInspector); - } - if (type instanceof RowType) { - return ObjectInspectorFactory.getStandardStructObjectInspector( - type.getTypeSignature().getParameters().stream() - .map(parameter -> parameter.getNamedTypeSignature().getName().get()) - .collect(toImmutableList()), - type.getTypeParameters().stream() - .map(HiveWriteUtils::getJavaObjectInspector) - .collect(toImmutableList())); - } - throw new IllegalArgumentException("unsupported type: " + type); } public static List createPartitionValues(List partitionColumnTypes, Page partitionColumns, int position) { ImmutableList.Builder partitionValues = ImmutableList.builder(); for (int field = 0; field < partitionColumns.getChannelCount(); field++) { - Object value = getField(DateTimeZone.UTC, partitionColumnTypes.get(field), partitionColumns.getBlock(field), position); - if (value == null) { - partitionValues.add(HIVE_DEFAULT_DYNAMIC_PARTITION); - } - else { - String valueString = value.toString(); - if (!CharMatcher.inRange((char) 0x20, (char) 0x7E).matchesAllOf(valueString)) { - throw new TrinoException(HIVE_INVALID_PARTITION_VALUE, - "Hive partition keys can only contain printable ASCII characters (0x20 - 0x7E). Invalid value: " + - base16().withSeparator(" ", 2).encode(valueString.getBytes(UTF_8))); - } - partitionValues.add(valueString); + String value = toPartitionValue(partitionColumnTypes.get(field), partitionColumns.getBlock(field), position); + if (!CharMatcher.inRange((char) 0x20, (char) 0x7E).matchesAllOf(value)) { + String encoded = base16().withSeparator(" ", 2).encode(value.getBytes(UTF_8)); + throw new TrinoException(HIVE_INVALID_PARTITION_VALUE, "Hive partition keys can only contain printable ASCII characters (0x20 - 0x7E). 
Invalid value: " + encoded); } + partitionValues.add(value); } return partitionValues.build(); } - public static Object getField(DateTimeZone localZone, Type type, Block block, int position) + private static String toPartitionValue(Type type, Block block, int position) { + // see HiveUtil#isValidPartitionType if (block.isNull(position)) { - return null; + return HIVE_DEFAULT_DYNAMIC_PARTITION; } if (BOOLEAN.equals(type)) { - return BOOLEAN.getBoolean(block, position); + return String.valueOf(BOOLEAN.getBoolean(block, position)); } if (BIGINT.equals(type)) { - return BIGINT.getLong(block, position); + return String.valueOf(BIGINT.getLong(block, position)); } if (INTEGER.equals(type)) { - return INTEGER.getInt(block, position); + return String.valueOf(INTEGER.getInt(block, position)); } if (SMALLINT.equals(type)) { - return SMALLINT.getShort(block, position); + return String.valueOf(SMALLINT.getShort(block, position)); } if (TINYINT.equals(type)) { - return TINYINT.getByte(block, position); + return String.valueOf(TINYINT.getByte(block, position)); } if (REAL.equals(type)) { - return REAL.getFloat(block, position); + return String.valueOf(REAL.getFloat(block, position)); } if (DOUBLE.equals(type)) { - return DOUBLE.getDouble(block, position); + return String.valueOf(DOUBLE.getDouble(block, position)); } if (type instanceof VarcharType varcharType) { - return new Text(varcharType.getSlice(block, position).getBytes()); + return varcharType.getSlice(block, position).toStringUtf8(); } if (type instanceof CharType charType) { - return new Text(padSpaces(charType.getSlice(block, position), charType).toStringUtf8()); - } - if (VARBINARY.equals(type)) { - return VARBINARY.getSlice(block, position).getBytes(); + return padSpaces(charType.getSlice(block, position), charType).toStringUtf8(); } if (DATE.equals(type)) { - return Date.ofEpochDay(DATE.getInt(block, position)); + return LocalDate.ofEpochDay(DATE.getInt(block, position)).format(HIVE_DATE_FORMATTER); } - if (type instanceof TimestampType timestampType) { - return getHiveTimestamp(localZone, timestampType, block, position); + if (TIMESTAMP_MILLIS.equals(type)) { + long epochMicros = type.getLong(block, position); + long epochSeconds = floorDiv(epochMicros, MICROSECONDS_PER_SECOND); + int nanosOfSecond = floorMod(epochMicros, MICROSECONDS_PER_SECOND) * NANOSECONDS_PER_MICROSECOND; + return LocalDateTime.ofEpochSecond(epochSeconds, nanosOfSecond, ZoneOffset.UTC).format(HIVE_TIMESTAMP_FORMATTER); } - if (type instanceof TimestampWithTimeZoneType) { - checkArgument(type.equals(TIMESTAMP_TZ_MILLIS)); - return getHiveTimestampTz(block, position); + if (TIMESTAMP_TZ_MILLIS.equals(type)) { + long epochMillis = unpackMillisUtc(type.getLong(block, position)); + return LocalDateTime.ofInstant(Instant.ofEpochMilli(epochMillis), ZoneOffset.UTC).format(HIVE_TIMESTAMP_FORMATTER); } if (type instanceof DecimalType decimalType) { - return getHiveDecimal(decimalType, block, position); - } - if (type instanceof ArrayType arrayType) { - Type elementType = arrayType.getElementType(); - Block arrayBlock = block.getObject(position, Block.class); - - List list = new ArrayList<>(arrayBlock.getPositionCount()); - for (int i = 0; i < arrayBlock.getPositionCount(); i++) { - list.add(getField(localZone, elementType, arrayBlock, i)); - } - return unmodifiableList(list); - } - if (type instanceof MapType mapType) { - Type keyType = mapType.getKeyType(); - Type valueType = mapType.getValueType(); - Block mapBlock = block.getObject(position, Block.class); - - Map map = new 
HashMap<>(); - for (int i = 0; i < mapBlock.getPositionCount(); i += 2) { - map.put( - getField(localZone, keyType, mapBlock, i), - getField(localZone, valueType, mapBlock, i + 1)); - } - return unmodifiableMap(map); - } - if (type instanceof RowType rowType) { - List fieldTypes = rowType.getTypeParameters(); - Block rowBlock = block.getObject(position, Block.class); - checkCondition( - fieldTypes.size() == rowBlock.getPositionCount(), - StandardErrorCode.GENERIC_INTERNAL_ERROR, - "Expected row value field count does not match type field count"); - List row = new ArrayList<>(rowBlock.getPositionCount()); - for (int i = 0; i < rowBlock.getPositionCount(); i++) { - row.add(getField(localZone, fieldTypes.get(i), rowBlock, i)); - } - return unmodifiableList(row); + return readBigDecimal(decimalType, block, position).stripTrailingZeros().toPlainString(); } - throw new TrinoException(NOT_SUPPORTED, "unsupported type: " + type); + throw new TrinoException(NOT_SUPPORTED, "Unsupported type for partition: " + type); } public static void checkTableIsWritable(Table table, boolean writesToNonManagedTablesEnabled) @@ -668,130 +458,4 @@ private static boolean isWritablePrimitiveType(PrimitiveCategory primitiveCatego } return false; } - - public static List getRowColumnInspectors(List types) - { - return types.stream() - .map(HiveWriteUtils::getRowColumnInspector) - .collect(toList()); - } - - public static ObjectInspector getRowColumnInspector(Type type) - { - if (type.equals(BOOLEAN)) { - return writableBooleanObjectInspector; - } - - if (type.equals(BIGINT)) { - return writableLongObjectInspector; - } - - if (type.equals(INTEGER)) { - return writableIntObjectInspector; - } - - if (type.equals(SMALLINT)) { - return writableShortObjectInspector; - } - - if (type.equals(TINYINT)) { - return writableByteObjectInspector; - } - - if (type.equals(REAL)) { - return writableFloatObjectInspector; - } - - if (type.equals(DOUBLE)) { - return writableDoubleObjectInspector; - } - - if (type instanceof VarcharType varcharType) { - if (varcharType.isUnbounded()) { - // Unbounded VARCHAR is not supported by Hive. 
- // Values for such columns must be stored as STRING in Hive - return writableStringObjectInspector; - } - if (varcharType.getBoundedLength() <= MAX_VARCHAR_LENGTH) { - // VARCHAR columns with the length less than or equal to 65535 are supported natively by Hive - return getPrimitiveWritableObjectInspector(getVarcharTypeInfo(varcharType.getBoundedLength())); - } - } - - if (type instanceof CharType charType) { - int charLength = charType.getLength(); - return getPrimitiveWritableObjectInspector(getCharTypeInfo(charLength)); - } - - if (type.equals(VARBINARY)) { - return writableBinaryObjectInspector; - } - - if (type.equals(DATE)) { - return writableDateObjectInspector; - } - - if (type instanceof TimestampType) { - return writableTimestampObjectInspector; - } - - if (type instanceof DecimalType decimalType) { - return getPrimitiveWritableObjectInspector(new DecimalTypeInfo(decimalType.getPrecision(), decimalType.getScale())); - } - - if (isStructuralType(type)) { - return getJavaObjectInspector(type); - } - - throw new IllegalArgumentException("unsupported type: " + type); - } - - public static HiveDecimal getHiveDecimal(DecimalType decimalType, Block block, int position) - { - BigInteger unscaledValue; - if (decimalType.isShort()) { - unscaledValue = BigInteger.valueOf(decimalType.getLong(block, position)); - } - else { - unscaledValue = ((Int128) decimalType.getObject(block, position)).toBigInteger(); - } - return HiveDecimal.create(unscaledValue, decimalType.getScale()); - } - - private static Timestamp getHiveTimestamp(DateTimeZone localZone, TimestampType type, Block block, int position) - { - verify(type.getPrecision() <= HiveTimestampPrecision.MAX.getPrecision(), "Timestamp precision too high for Hive"); - - long epochMicros; - int nanosOfMicro; - if (type.isShort()) { - epochMicros = type.getLong(block, position); - nanosOfMicro = 0; - } - else { - LongTimestamp timestamp = (LongTimestamp) type.getObject(block, position); - epochMicros = timestamp.getEpochMicros(); - nanosOfMicro = timestamp.getPicosOfMicro() / PICOSECONDS_PER_NANOSECOND; - } - - long epochSeconds; - if (DateTimeZone.UTC.equals(localZone)) { - epochSeconds = floorDiv(epochMicros, MICROSECONDS_PER_SECOND); - } - else { - long localEpochMillis = floorDiv(epochMicros, MICROSECONDS_PER_MILLISECOND); - long utcEpochMillis = localZone.convertLocalToUTC(localEpochMillis, false); - epochSeconds = floorDiv(utcEpochMillis, MILLISECONDS_PER_SECOND); - } - - int microsOfSecond = floorMod(epochMicros, MICROSECONDS_PER_SECOND); - int nanosOfSecond = microsOfSecond * NANOSECONDS_PER_MICROSECOND + nanosOfMicro; - return Timestamp.ofEpochSecond(epochSeconds, nanosOfSecond); - } - - private static Timestamp getHiveTimestampTz(Block block, int position) - { - long epochMillis = unpackMillisUtc(block.getLong(position, 0)); - return Timestamp.ofEpochMilli(epochMillis); - } } diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/InternalHiveSplitFactory.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/InternalHiveSplitFactory.java index 3922f6b3983c6..3395cfd426a9b 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/InternalHiveSplitFactory.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/InternalHiveSplitFactory.java @@ -29,15 +29,10 @@ import io.trino.plugin.hive.orc.OrcPageSourceFactory; import io.trino.plugin.hive.parquet.ParquetPageSourceFactory; import io.trino.plugin.hive.rcfile.RcFilePageSourceFactory; -import 
io.trino.plugin.hive.s3select.S3SelectPushdown; import io.trino.spi.HostAddress; import io.trino.spi.predicate.Domain; import io.trino.spi.predicate.TupleDomain; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.mapred.FileSplit; -import java.io.IOException; import java.util.Collection; import java.util.List; import java.util.Map; @@ -54,7 +49,6 @@ public class InternalHiveSplitFactory { - private final FileSystem fileSystem; private final String partitionName; private final HiveStorageFormat storageFormat; private final Properties strippedSchema; @@ -67,10 +61,8 @@ public class InternalHiveSplitFactory private final long minimumTargetSplitSizeInBytes; private final Optional maxSplitFileSize; private final boolean forceLocalScheduling; - private final boolean s3SelectPushdownEnabled; public InternalHiveSplitFactory( - FileSystem fileSystem, String partitionName, HiveStorageFormat storageFormat, Properties schema, @@ -82,10 +74,8 @@ public InternalHiveSplitFactory( Optional bucketValidation, DataSize minimumTargetSplitSize, boolean forceLocalScheduling, - boolean s3SelectPushdownEnabled, Optional maxSplitFileSize) { - this.fileSystem = requireNonNull(fileSystem, "fileSystem is null"); this.partitionName = requireNonNull(partitionName, "partitionName is null"); this.storageFormat = requireNonNull(storageFormat, "storageFormat is null"); this.strippedSchema = stripUnnecessaryProperties(requireNonNull(schema, "schema is null")); @@ -96,7 +86,6 @@ public InternalHiveSplitFactory( this.bucketConversion = requireNonNull(bucketConversion, "bucketConversion is null"); this.bucketValidation = requireNonNull(bucketValidation, "bucketValidation is null"); this.forceLocalScheduling = forceLocalScheduling; - this.s3SelectPushdownEnabled = s3SelectPushdownEnabled; this.minimumTargetSplitSizeInBytes = minimumTargetSplitSize.toBytes(); this.maxSplitFileSize = requireNonNull(maxSplitFileSize, "maxSplitFileSize is null"); checkArgument(minimumTargetSplitSizeInBytes > 0, "minimumTargetSplitSize must be > 0, found: %s", minimumTargetSplitSize); @@ -134,23 +123,6 @@ public Optional createInternalHiveSplit(TrinoFileStatus statu acidInfo); } - public Optional createInternalHiveSplit(FileSplit split) - throws IOException - { - FileStatus file = fileSystem.getFileStatus(split.getPath()); - return createInternalHiveSplit( - split.getPath().toString(), - BlockLocation.fromHiveBlockLocations(fileSystem.getFileBlockLocations(file, split.getStart(), split.getLength())), - split.getStart(), - split.getLength(), - file.getLen(), - file.getModificationTime(), - OptionalInt.empty(), - OptionalInt.empty(), - false, - Optional.empty()); - } - private Optional createInternalHiveSplit( String path, List blockLocations, @@ -223,7 +195,6 @@ private Optional createInternalHiveSplit( tableToPartitionMapping, bucketConversion, bucketValidation, - s3SelectPushdownEnabled && S3SelectPushdown.isCompressionCodecSupported(strippedSchema, path), acidInfo, partitionMatchSupplier)); } diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/SequenceFileRecordWriter.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/SequenceFileRecordWriter.java deleted file mode 100644 index 34400a63d2e69..0000000000000 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/SequenceFileRecordWriter.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the 
License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package io.trino.plugin.hive.util;
-
-import io.trino.plugin.hive.RecordFileWriter.ExtendedRecordWriter;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.BytesWritable;
-import org.apache.hadoop.io.SequenceFile.Writer;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.Reporter;
-
-import java.io.Closeable;
-import java.io.IOException;
-
-import static org.apache.hadoop.hive.ql.exec.Utilities.createSequenceWriter;
-
-public class SequenceFileRecordWriter
-        implements ExtendedRecordWriter
-{
-    private long finalWrittenBytes = -1;
-    private final Writer writer;
-    private static final Writable EMPTY_KEY = new BytesWritable();
-
-    public SequenceFileRecordWriter(Path path, JobConf jobConf, Class valueClass, boolean compressed)
-            throws IOException
-    {
-        writer = createSequenceWriter(jobConf, path.getFileSystem(jobConf), path, BytesWritable.class, valueClass, compressed, Reporter.NULL);
-    }
-
-    @Override
-    public long getWrittenBytes()
-    {
-        if (finalWrittenBytes != -1) {
-            return finalWrittenBytes;
-        }
-        try {
-            return writer.getLength();
-        }
-        catch (IOException e) {
-            return 0; // do nothing
-        }
-    }
-
-    @Override
-    public void write(Writable writable)
-            throws IOException
-    {
-        writer.append(EMPTY_KEY, writable);
-    }
-
-    @Override
-    public void close(boolean abort)
-            throws IOException
-    {
-        try (Closeable ignored = writer) {
-            if (finalWrittenBytes == -1) {
-                writer.hflush();
-                finalWrittenBytes = writer.getLength();
-            }
-        }
-    }
-}
diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/SerdeConstants.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/SerdeConstants.java
index 8c2ec8ca21845..e0d993c3cc723 100644
--- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/SerdeConstants.java
+++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/SerdeConstants.java
@@ -16,15 +16,10 @@
 public final class SerdeConstants
 {
     public static final String SERIALIZATION_LIB = "serialization.lib";
-    public static final String SERIALIZATION_FORMAT = "serialization.format";
     public static final String SERIALIZATION_NULL_FORMAT = "serialization.null.format";
-    public static final String SERIALIZATION_LAST_COLUMN_TAKES_REST = "serialization.last.column.takes.rest";
 
     public static final String FIELD_DELIM = "field.delim";
-    public static final String COLLECTION_DELIM = "collection.delim";
     public static final String LINE_DELIM = "line.delim";
-    public static final String MAPKEY_DELIM = "mapkey.delim";
-    public static final String QUOTE_CHAR = "quote.delim";
     public static final String ESCAPE_CHAR = "escape.delim";
 
     public static final String HEADER_COUNT = "skip.header.line.count";
@@ -34,8 +29,6 @@ public final class SerdeConstants
     public static final String LIST_COLUMN_TYPES = "columns.types";
     public static final String LIST_COLUMN_COMMENTS = "columns.comments";
 
-    public static final String COLUMN_NAME_DELIMITER = "column.name.delimiter";
-
     public static final String VOID_TYPE_NAME = "void";
     public static final String BOOLEAN_TYPE_NAME = "boolean";
     public static final String TINYINT_TYPE_NAME = "tinyint";
diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/TextHeaderWriter.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/TextHeaderWriter.java
deleted file mode 100644
index 6084c09cadae3..0000000000000
--- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/TextHeaderWriter.java
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package io.trino.plugin.hive.util;
-
-import io.trino.plugin.hive.HiveType;
-import io.trino.spi.connector.ConnectorSession;
-import io.trino.spi.type.Type;
-import io.trino.spi.type.TypeManager;
-import org.apache.hadoop.hive.serde2.SerDeException;
-import org.apache.hadoop.hive.serde2.Serializer;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
-import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector;
-import org.apache.hadoop.io.BinaryComparable;
-import org.apache.hadoop.io.Text;
-
-import java.io.IOException;
-import java.io.OutputStream;
-import java.util.List;
-
-import static com.google.common.collect.ImmutableList.toImmutableList;
-import static io.trino.plugin.hive.HiveSessionProperties.getTimestampPrecision;
-import static java.util.Collections.nCopies;
-
-public class TextHeaderWriter
-{
-    private final Serializer serializer;
-    private final Type headerType;
-    private final List fileColumnNames;
-
-    public TextHeaderWriter(Serializer serializer, TypeManager typeManager, ConnectorSession session, List fileColumnNames)
-    {
-        this.serializer = serializer;
-        this.fileColumnNames = fileColumnNames;
-        this.headerType = HiveType.valueOf("string").getType(typeManager, getTimestampPrecision(session));
-    }
-
-    public void write(OutputStream compressedOutput, int rowSeparator)
-            throws IOException
-    {
-        try {
-            ObjectInspector stringObjectInspector = HiveWriteUtils.getRowColumnInspector(headerType);
-            List headers = fileColumnNames.stream().map(Text::new).collect(toImmutableList());
-            List inspectors = nCopies(fileColumnNames.size(), stringObjectInspector);
-            StandardStructObjectInspector headerStructObjectInspectors = ObjectInspectorFactory.getStandardStructObjectInspector(fileColumnNames, inspectors);
-            BinaryComparable binary = (BinaryComparable) serializer.serialize(headers, headerStructObjectInspectors);
-            compressedOutput.write(binary.getBytes(), 0, binary.getLength());
-            compressedOutput.write(rowSeparator);
-        }
-        catch (SerDeException e) {
-            throw new IOException(e);
-        }
-    }
-}
diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/TextRecordWriter.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/TextRecordWriter.java
deleted file mode 100644
index f0fc2acf557d9..0000000000000
--- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/TextRecordWriter.java
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package io.trino.plugin.hive.util;
-
-import io.trino.plugin.hive.RecordFileWriter.ExtendedRecordWriter;
-import org.apache.hadoop.fs.FSDataOutputStream;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.BinaryComparable;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.Reporter;
-
-import java.io.IOException;
-import java.io.OutputStream;
-import java.util.Optional;
-import java.util.Properties;
-
-import static io.trino.plugin.hive.util.SerdeConstants.LINE_DELIM;
-import static java.lang.Integer.parseInt;
-import static org.apache.hadoop.hive.ql.exec.Utilities.createCompressedStream;
-
-public class TextRecordWriter
-        implements ExtendedRecordWriter
-{
-    private final FSDataOutputStream output;
-    private final OutputStream compressedOutput;
-    private final int rowSeparator;
-
-    public TextRecordWriter(Path path, JobConf jobConf, Properties properties, boolean isCompressed, Optional textHeaderWriter)
-            throws IOException
-    {
-        String rowSeparatorString = properties.getProperty(LINE_DELIM, "\n");
-        // same logic as HiveIgnoreKeyTextOutputFormat
-        int rowSeparatorByte;
-        try {
-            rowSeparatorByte = Byte.parseByte(rowSeparatorString);
-        }
-        catch (NumberFormatException e) {
-            rowSeparatorByte = rowSeparatorString.charAt(0);
-        }
-        rowSeparator = rowSeparatorByte;
-        output = path.getFileSystem(jobConf).create(path, Reporter.NULL);
-        compressedOutput = createCompressedStream(jobConf, output, isCompressed);
-
-        Optional skipHeaderLine = Optional.ofNullable(properties.getProperty("skip.header.line.count"));
-        if (skipHeaderLine.isPresent()) {
-            if (parseInt(skipHeaderLine.get()) == 1) {
-                textHeaderWriter
-                        .orElseThrow(() -> new IllegalArgumentException("TextHeaderWriter must not be empty when skip.header.line.count is set to 1"))
-                        .write(compressedOutput, rowSeparator);
-            }
-        }
-    }
-
-    @Override
-    public long getWrittenBytes()
-    {
-        return output.getPos();
-    }
-
-    @Override
-    public void write(Writable writable)
-            throws IOException
-    {
-        BinaryComparable binary = (BinaryComparable) writable;
-        compressedOutput.write(binary.getBytes(), 0, binary.getLength());
-        compressedOutput.write(rowSeparator);
-    }
-
-    @Override
-    public void close(boolean abort)
-            throws IOException
-    {
-        compressedOutput.close();
-    }
-}
diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/AbstractTestHive.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/AbstractTestHive.java
index c3efe779a3c5b..c9c8cd9605b6f 100644
--- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/AbstractTestHive.java
+++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/AbstractTestHive.java
@@ -100,8 +100,6 @@
 import io.trino.spi.connector.DiscretePredicates;
 import io.trino.spi.connector.DynamicFilter;
 import io.trino.spi.connector.ProjectionApplicationResult;
-import io.trino.spi.connector.RecordCursor;
-import io.trino.spi.connector.RecordPageSource;
 import io.trino.spi.connector.SchemaTableName;
 import io.trino.spi.connector.SchemaTablePrefix;
import io.trino.spi.connector.SortingProperty; @@ -248,7 +246,6 @@ import static io.trino.plugin.hive.HiveTestUtils.arrayType; import static io.trino.plugin.hive.HiveTestUtils.getDefaultHiveFileWriterFactories; import static io.trino.plugin.hive.HiveTestUtils.getDefaultHivePageSourceFactories; -import static io.trino.plugin.hive.HiveTestUtils.getDefaultHiveRecordCursorProviders; import static io.trino.plugin.hive.HiveTestUtils.getHiveSession; import static io.trino.plugin.hive.HiveTestUtils.getHiveSessionProperties; import static io.trino.plugin.hive.HiveTestUtils.getTypes; @@ -911,7 +908,6 @@ public Optional getMaterializedView(Connect partitionManager, new HdfsFileSystemFactory(hdfsEnvironment, HDFS_FILE_SYSTEM_STATS), new HdfsNamenodeStats(), - hdfsEnvironment, executor, new CounterStat(), 100, @@ -927,7 +923,6 @@ public Optional getMaterializedView(Connect pageSinkProvider = new HivePageSinkProvider( getDefaultHiveFileWriterFactories(hiveConfig, hdfsEnvironment), new HdfsFileSystemFactory(hdfsEnvironment, HDFS_FILE_SYSTEM_STATS), - hdfsEnvironment, PAGE_SORTER, HiveMetastoreFactory.ofInstance(metastoreClient), new GroupByHashPageIndexerFactory(JOIN_COMPILER, TYPE_OPERATORS), @@ -942,11 +937,8 @@ public Optional getMaterializedView(Connect new HiveWriterStats()); pageSourceProvider = new HivePageSourceProvider( TESTING_TYPE_MANAGER, - hdfsEnvironment, hiveConfig, - getDefaultHivePageSourceFactories(hdfsEnvironment, hiveConfig), - getDefaultHiveRecordCursorProviders(hiveConfig, hdfsEnvironment), - new GenericHiveRecordCursorProvider(hdfsEnvironment, hiveConfig)); + getDefaultHivePageSourceFactories(hdfsEnvironment, hiveConfig)); nodePartitioningProvider = new HiveNodePartitioningProvider( new TestingNodeManager("fake-environment"), TESTING_TYPE_MANAGER); @@ -5449,22 +5441,7 @@ protected String getPartitionId(Object partition) protected static void assertPageSourceType(ConnectorPageSource pageSource, HiveStorageFormat hiveStorageFormat) { - if (pageSource instanceof RecordPageSource) { - RecordCursor hiveRecordCursor = ((RecordPageSource) pageSource).getCursor(); - hiveRecordCursor = ((HiveRecordCursor) hiveRecordCursor).getRegularColumnRecordCursor(); - if (hiveRecordCursor instanceof HiveBucketValidationRecordCursor) { - hiveRecordCursor = ((HiveBucketValidationRecordCursor) hiveRecordCursor).delegate(); - } - assertInstanceOf(hiveRecordCursor, recordCursorType(), hiveStorageFormat.name()); - } - else { - assertInstanceOf(((HivePageSource) pageSource).getPageSource(), pageSourceType(hiveStorageFormat), hiveStorageFormat.name()); - } - } - - private static Class recordCursorType() - { - return GenericHiveRecordCursor.class; + assertInstanceOf(((HivePageSource) pageSource).getPageSource(), pageSourceType(hiveStorageFormat), hiveStorageFormat.name()); } private static Class pageSourceType(HiveStorageFormat hiveStorageFormat) diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/AbstractTestHiveFileFormats.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/AbstractTestHiveFileFormats.java index 8cd5bcdeb3580..cf6927ba0c7c0 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/AbstractTestHiveFileFormats.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/AbstractTestHiveFileFormats.java @@ -21,7 +21,6 @@ import io.trino.plugin.hive.metastore.StorageFormat; import io.trino.spi.Page; import io.trino.spi.PageBuilder; -import io.trino.spi.block.Block; import io.trino.spi.block.BlockBuilder; import io.trino.spi.connector.ConnectorPageSource; import 
io.trino.spi.connector.ConnectorSession; @@ -67,7 +66,6 @@ import java.io.File; import java.io.IOException; -import java.lang.invoke.MethodHandle; import java.math.BigDecimal; import java.math.BigInteger; import java.util.ArrayList; @@ -83,7 +81,6 @@ import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.collect.ImmutableList.toImmutableList; -import static com.google.common.collect.ImmutableMap.toImmutableMap; import static io.trino.hadoop.ConfigurationInstantiator.newEmptyConfiguration; import static io.trino.plugin.hive.HiveColumnHandle.ColumnType.PARTITION_KEY; import static io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; @@ -91,7 +88,6 @@ import static io.trino.plugin.hive.HiveColumnProjectionInfo.generatePartialName; import static io.trino.plugin.hive.HivePartitionKey.HIVE_DEFAULT_DYNAMIC_PARTITION; import static io.trino.plugin.hive.HiveTestUtils.SESSION; -import static io.trino.plugin.hive.HiveTestUtils.isDistinctFrom; import static io.trino.plugin.hive.HiveTestUtils.mapType; import static io.trino.plugin.hive.acid.AcidTransaction.NO_ACID_TRANSACTION; import static io.trino.plugin.hive.util.CompressionConfigUtil.configureCompression; @@ -123,7 +119,6 @@ import static java.nio.charset.StandardCharsets.UTF_8; import static java.util.Arrays.fill; import static java.util.Objects.requireNonNull; -import static java.util.function.Function.identity; import static java.util.stream.Collectors.toList; import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardListObjectInspector; import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardMapObjectInspector; @@ -143,8 +138,6 @@ import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.getCharTypeInfo; import static org.joda.time.DateTimeZone.UTC; import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertFalse; -import static org.testng.Assert.assertTrue; public abstract class AbstractTestHiveFileFormats { @@ -727,59 +720,6 @@ public static Object getFieldFromCursor(RecordCursor cursor, Type type, int fiel throw new RuntimeException("unknown type"); } - protected void checkCursor(RecordCursor cursor, List testColumns, int rowCount) - { - List types = testColumns.stream() - .map(column -> column.getObjectInspector().getTypeName()) - .map(type -> HiveType.valueOf(type).getType(TESTING_TYPE_MANAGER)) - .collect(toImmutableList()); - - Map distinctFromOperators = types.stream().distinct() - .collect(toImmutableMap(identity(), HiveTestUtils::distinctFromOperator)); - - for (int row = 0; row < rowCount; row++) { - assertTrue(cursor.advanceNextPosition()); - for (int i = 0, testColumnsSize = testColumns.size(); i < testColumnsSize; i++) { - TestColumn testColumn = testColumns.get(i); - - Type type = types.get(i); - Object fieldFromCursor = getFieldFromCursor(cursor, type, i); - if (fieldFromCursor == null) { - assertEquals(null, testColumn.getExpectedValue(), "Expected null for column " + testColumn.getName()); - } - else if (type instanceof DecimalType decimalType) { - fieldFromCursor = new BigDecimal((BigInteger) fieldFromCursor, decimalType.getScale()); - assertEquals(fieldFromCursor, testColumn.getExpectedValue(), "Wrong value for column " + testColumn.getName()); - } - else if (testColumn.getObjectInspector().getTypeName().equals("float")) { - assertEquals((float) fieldFromCursor, (float) testColumn.getExpectedValue(), (float) EPSILON); - } - else if 
(testColumn.getObjectInspector().getTypeName().equals("double")) { - assertEquals((double) fieldFromCursor, (double) testColumn.getExpectedValue(), EPSILON); - } - else if (testColumn.getObjectInspector().getTypeName().equals("tinyint")) { - assertEquals(((Number) fieldFromCursor).byteValue(), testColumn.getExpectedValue()); - } - else if (testColumn.getObjectInspector().getTypeName().equals("smallint")) { - assertEquals(((Number) fieldFromCursor).shortValue(), testColumn.getExpectedValue()); - } - else if (testColumn.getObjectInspector().getTypeName().equals("int")) { - assertEquals(((Number) fieldFromCursor).intValue(), testColumn.getExpectedValue()); - } - else if (testColumn.getObjectInspector().getCategory() == Category.PRIMITIVE) { - assertEquals(fieldFromCursor, testColumn.getExpectedValue(), "Wrong value for column " + testColumn.getName()); - } - else { - Block expected = (Block) testColumn.getExpectedValue(); - Block actual = (Block) fieldFromCursor; - boolean distinct = isDistinctFrom(distinctFromOperators.get(type), expected, actual); - assertFalse(distinct, "Wrong value for column: " + testColumn.getName()); - } - } - } - assertFalse(cursor.advanceNextPosition()); - } - protected void checkPageSource(ConnectorPageSource pageSource, List testColumns, List types, int rowCount) throws IOException { diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/AbstractTestHiveFileSystem.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/AbstractTestHiveFileSystem.java index 63e70d2fcf422..ea77cb7c90854 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/AbstractTestHiveFileSystem.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/AbstractTestHiveFileSystem.java @@ -113,7 +113,6 @@ import static io.trino.plugin.hive.HiveTestUtils.SESSION; import static io.trino.plugin.hive.HiveTestUtils.getDefaultHiveFileWriterFactories; import static io.trino.plugin.hive.HiveTestUtils.getDefaultHivePageSourceFactories; -import static io.trino.plugin.hive.HiveTestUtils.getDefaultHiveRecordCursorProviders; import static io.trino.plugin.hive.HiveTestUtils.getHiveSessionProperties; import static io.trino.plugin.hive.HiveTestUtils.getTypes; import static io.trino.plugin.hive.HiveType.HIVE_LONG; @@ -186,7 +185,7 @@ public void tearDown() protected void onSetupComplete() {} - protected void setup(String host, int port, String databaseName, boolean s3SelectPushdownEnabled, HdfsConfiguration hdfsConfiguration) + protected void setup(String host, int port, String databaseName, HdfsConfiguration hdfsConfiguration) { database = databaseName; table = new SchemaTableName(database, "trino_test_external_fs"); @@ -198,8 +197,7 @@ protected void setup(String host, int port, String databaseName, boolean s3Selec temporaryCreateTableWithExternalLocation = new SchemaTableName(database, "tmp_trino_test_create_external" + random); config = new HiveConfig() - .setWritesToNonManagedTablesEnabled(true) - .setS3SelectPushdownEnabled(s3SelectPushdownEnabled); + .setWritesToNonManagedTablesEnabled(true); HivePartitionManager hivePartitionManager = new HivePartitionManager(config); @@ -247,7 +245,6 @@ protected void setup(String host, int port, String databaseName, boolean s3Selec hivePartitionManager, new HdfsFileSystemFactory(hdfsEnvironment, HDFS_FILE_SYSTEM_STATS), new HdfsNamenodeStats(), - hdfsEnvironment, new BoundedExecutor(executor, config.getMaxSplitIteratorThreads()), new CounterStat(), config.getMaxOutstandingSplits(), @@ -264,7 +261,6 @@ protected void setup(String host, int 
port, String databaseName, boolean s3Selec pageSinkProvider = new HivePageSinkProvider( getDefaultHiveFileWriterFactories(config, hdfsEnvironment), new HdfsFileSystemFactory(hdfsEnvironment, HDFS_FILE_SYSTEM_STATS), - hdfsEnvironment, PAGE_SORTER, HiveMetastoreFactory.ofInstance(metastoreClient), new GroupByHashPageIndexerFactory(new JoinCompiler(typeOperators), typeOperators), @@ -279,11 +275,8 @@ protected void setup(String host, int port, String databaseName, boolean s3Selec new HiveWriterStats()); pageSourceProvider = new HivePageSourceProvider( TESTING_TYPE_MANAGER, - hdfsEnvironment, config, - getDefaultHivePageSourceFactories(hdfsEnvironment, config), - getDefaultHiveRecordCursorProviders(config, hdfsEnvironment), - new GenericHiveRecordCursorProvider(hdfsEnvironment, config)); + getDefaultHivePageSourceFactories(hdfsEnvironment, config)); onSetupComplete(); } diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/BaseHiveConnectorTest.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/BaseHiveConnectorTest.java index 7b63f8e50eda5..eff06ada51bcd 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/BaseHiveConnectorTest.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/BaseHiveConnectorTest.java @@ -8789,24 +8789,7 @@ private List getAllTestingHiveStorageFormat() continue; } - Session defaultSession = getSession(); - String catalogName = defaultSession.getCatalog().orElseThrow(); - for (boolean enabled : List.of(true, false)) { - Session session = Session.builder(defaultSession) - .setCatalogSessionProperty(catalogName, "avro_native_reader_enabled", Boolean.toString(enabled)) - .setCatalogSessionProperty(catalogName, "avro_native_writer_enabled", Boolean.toString(enabled)) - .setCatalogSessionProperty(catalogName, "csv_native_reader_enabled", Boolean.toString(enabled)) - .setCatalogSessionProperty(catalogName, "csv_native_writer_enabled", Boolean.toString(enabled)) - .setCatalogSessionProperty(catalogName, "json_native_reader_enabled", Boolean.toString(enabled)) - .setCatalogSessionProperty(catalogName, "json_native_writer_enabled", Boolean.toString(enabled)) - .setCatalogSessionProperty(catalogName, "regex_native_reader_enabled", Boolean.toString(enabled)) - .setCatalogSessionProperty(catalogName, "text_file_native_reader_enabled", Boolean.toString(enabled)) - .setCatalogSessionProperty(catalogName, "text_file_native_writer_enabled", Boolean.toString(enabled)) - .setCatalogSessionProperty(catalogName, "sequence_file_native_reader_enabled", Boolean.toString(enabled)) - .setCatalogSessionProperty(catalogName, "sequence_file_native_writer_enabled", Boolean.toString(enabled)) - .build(); - formats.add(new TestingHiveStorageFormat(session, hiveStorageFormat)); - } + formats.add(new TestingHiveStorageFormat(getSession(), hiveStorageFormat)); } return formats.build(); } diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/BaseTestHiveOnDataLake.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/BaseTestHiveOnDataLake.java index da982853aab9c..4fa61fef34efa 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/BaseTestHiveOnDataLake.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/BaseTestHiveOnDataLake.java @@ -29,11 +29,8 @@ import io.trino.spi.predicate.NullableValue; import io.trino.spi.predicate.TupleDomain; import io.trino.testing.AbstractTestQueryFramework; -import io.trino.testing.MaterializedResult; import io.trino.testing.QueryRunner; import io.trino.testing.minio.MinioClient; -import 
io.trino.testing.sql.TestTable; -import org.intellij.lang.annotations.Language; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -67,7 +64,6 @@ import static java.util.stream.Collectors.joining; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.testng.Assert.assertEquals; public abstract class BaseTestHiveOnDataLake extends AbstractTestQueryFramework @@ -110,7 +106,6 @@ protected QueryRunner createQueryRunner() .put("hive.s3.streaming.part-size", HIVE_S3_STREAMING_PART_SIZE.toString()) // This is required to enable AWS Athena partition projection .put("hive.partition-projection-enabled", "true") - .put("hive.s3select-pushdown.experimental-textfile-pushdown-enabled", "true") .buildOrThrow()) .build(); } @@ -1793,249 +1788,6 @@ public void testPartitionedTableExternalLocationOnTopOfTheBucket() assertUpdate("DROP TABLE " + tableName); } - @Test(dataProvider = "s3SelectFileFormats") - public void testS3SelectPushdown(String tableProperties) - { - Session usingAppendInserts = Session.builder(getSession()) - .setCatalogSessionProperty("hive", "insert_existing_partitions_behavior", "APPEND") - .build(); - List values = ImmutableList.of( - "1, true, 11, 111, 1111, 11111, 'one', DATE '2020-01-01'", - "2, true, 22, 222, 2222, 22222, 'two', DATE '2020-02-02'", - "3, NULL, NULL, NULL, NULL, NULL, NULL, NULL", - "4, false, 44, 444, 4444, 44444, 'four', DATE '2020-04-04'"); - try (TestTable table = new TestTable( - sql -> getQueryRunner().execute(usingAppendInserts, sql), - "hive.%s.test_s3_select_pushdown".formatted(HIVE_TEST_SCHEMA), - "(id INT, bool_t BOOLEAN, tiny_t TINYINT, small_t SMALLINT, int_t INT, big_t BIGINT, string_t VARCHAR, date_t DATE) " + - "WITH (" + tableProperties + ")", values)) { - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE bool_t = true", "VALUES 1, 2"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE bool_t = false", "VALUES 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE bool_t IS NULL", "VALUES 3"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE bool_t IS NOT NULL", "VALUES 1, 2, 4"); - - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t = 22", "VALUES 2"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t != 22", "VALUES 1, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t > 22", "VALUES 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t >= 22", "VALUES 2, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t = 22 OR tiny_t = 44", "VALUES 2, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t IS NULL OR tiny_t >= 22", "VALUES 2, 3, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t IS NULL", "VALUES 3"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t IS NOT NULL", "VALUES 1, 2, 4"); - - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t = 222", "VALUES 2"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t != 222", "VALUES 1, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t > 222", "VALUES 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t >= 222", "VALUES 2, 4"); - assertS3SelectQuery("SELECT id 
FROM " + table.getName() + " WHERE small_t = 222 OR small_t = 444", "VALUES 2, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t IS NULL OR small_t >= 222", "VALUES 2, 3, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t IS NULL", "VALUES 3"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t IS NOT NULL", "VALUES 1, 2, 4"); - - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t = 2222", "VALUES 2"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t != 2222", "VALUES 1, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t > 2222", "VALUES 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t >= 2222", "VALUES 2, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t = 2222 OR int_t = 4444", "VALUES 2, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t IS NULL OR int_t >= 2222", "VALUES 2, 3, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t IS NULL", "VALUES 3"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t IS NOT NULL", "VALUES 1, 2, 4"); - - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t = 22222", "VALUES 2"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t != 22222", "VALUES 1, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t > 22222", "VALUES 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t >= 22222", "VALUES 2, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t = 22222 OR big_t = 44444", "VALUES 2, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t IS NULL OR big_t >= 22222", "VALUES 2, 3, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t IS NULL", "VALUES 3"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t IS NOT NULL", "VALUES 1, 2, 4"); - - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t = 'two'", "VALUES 2"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t != 'two'", "VALUES 1, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t < 'two'", "VALUES 1, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t <= 'two'", "VALUES 1, 2, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t = 'two' OR string_t = 'four'", "VALUES 2, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t IS NULL OR string_t >= 'two'", "VALUES 2, 3"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t IS NULL", "VALUES 3"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t IS NOT NULL", "VALUES 1, 2, 4"); - - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t = DATE '2020-02-02'", "VALUES 2"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t != DATE '2020-02-02'", "VALUES 1, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t > DATE '2020-02-02'", "VALUES 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t <= DATE '2020-02-02'", "VALUES 1, 2"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t = DATE '2020-02-02' OR date_t = DATE '2020-04-04'", "VALUES 2, 4"); 
- assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t IS NULL OR date_t >= DATE '2020-02-02'", "VALUES 2, 3, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t IS NULL", "VALUES 3"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t IS NOT NULL", "VALUES 1, 2, 4"); - } - } - - @Test(dataProvider = "s3SelectFileFormats") - public void testS3SelectOnDecimalColumnIsDisabled(String tableProperties) - { - Session usingAppendInserts = Session.builder(getSession()) - .setCatalogSessionProperty("hive", "insert_existing_partitions_behavior", "APPEND") - .build(); - List values = ImmutableList.of("1, 1.1", "2, 2.2", "3, NULL", "4, 4.4"); - try (TestTable table = new TestTable( - sql -> getQueryRunner().execute(usingAppendInserts, sql), - "hive.%s.test_s3_select_pushdown".formatted(HIVE_TEST_SCHEMA), - "(id INT, decimal_t DECIMAL(10, 5)) WITH (" + tableProperties + ")", - values)) { - assertNoS3SelectPushdown("SELECT id FROM " + table.getName() + " WHERE decimal_t = 2.2", "VALUES 2"); - assertNoS3SelectPushdown("SELECT id FROM " + table.getName() + " WHERE decimal_t != 2.2", "VALUES 1, 4"); - assertNoS3SelectPushdown("SELECT id FROM " + table.getName() + " WHERE decimal_t < 2.2", "VALUES 1"); - assertNoS3SelectPushdown("SELECT id FROM " + table.getName() + " WHERE decimal_t <= 2.2", "VALUES 1, 2"); - assertNoS3SelectPushdown("SELECT id FROM " + table.getName() + " WHERE decimal_t = 2.2 OR decimal_t = 4.4", "VALUES 2, 4"); - assertNoS3SelectPushdown("SELECT id FROM " + table.getName() + " WHERE decimal_t IS NULL OR decimal_t >= 2.2", "VALUES 2, 3, 4"); - assertNoS3SelectPushdown("SELECT id FROM " + table.getName() + " WHERE decimal_t IS NULL", "VALUES 3"); - assertNoS3SelectPushdown("SELECT id FROM " + table.getName() + " WHERE decimal_t IS NOT NULL", "VALUES 1, 2, 4"); - } - } - - @Test - public void testJsonS3SelectPushdownWithSpecialCharacters() - { - Session usingAppendInserts = Session.builder(getSession()) - .setCatalogSessionProperty("hive", "insert_existing_partitions_behavior", "APPEND") - .build(); - - List specialCharacterValues = ImmutableList.of( - "1, 'a,comma'", - "2, 'a|pipe'", - "3, 'an''escaped quote'", - "4, 'a\"double quote'"); - try (TestTable table = new TestTable( - sql -> getQueryRunner().execute(usingAppendInserts, sql), - "hive.%s.test_s3_select_pushdown_special_characters".formatted(HIVE_TEST_SCHEMA), - "(id INT, string_t VARCHAR) WITH (format = 'JSON')", - specialCharacterValues)) { - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t ='a,comma'", "VALUES 1"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t ='a|pipe'", "VALUES 2"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t ='an''escaped quote'", "VALUES 3"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t ='a\"double quote'", "VALUES 4"); - } - } - - @Test - public void testS3SelectExperimentalPushdown() - { - // Demonstrate correctness issues which have resulted in pushdown for TEXTFILE - // using CSV support in S3 Select being put behind a separate "experimental" flag. 
- // TODO: https://github.com/trinodb/trino/issues/17775 - Session usingAppendInserts = Session.builder(getSession()) - .setCatalogSessionProperty("hive", "insert_existing_partitions_behavior", "APPEND") - .build(); - List values = ImmutableList.of( - "1, true, 11", - "2, true, 22", - "3, NULL, NULL", - "4, false, 44"); - Session withS3SelectPushdown = Session.builder(getSession()) - .setCatalogSessionProperty("hive", "s3_select_pushdown_enabled", "true") - .setCatalogSessionProperty("hive", "json_native_reader_enabled", "false") - .setCatalogSessionProperty("hive", "text_file_native_reader_enabled", "false") - .build(); - - Session withoutS3SelectPushdown = Session.builder(getSession()) - .setCatalogSessionProperty("hive", "json_native_reader_enabled", "false") - .setCatalogSessionProperty("hive", "text_file_native_reader_enabled", "false") - .build(); - - try (TestTable table = new TestTable( - sql -> getQueryRunner().execute(usingAppendInserts, sql), - "hive.%s.test_s3_select_pushdown_experimental_features".formatted(HIVE_TEST_SCHEMA), - "(id INT, bool_t BOOLEAN, int_t INT) WITH (format = 'TEXTFILE')", - values)) { - assertQuery(withoutS3SelectPushdown, "SELECT id FROM " + table.getName() + " WHERE int_t IS NULL", "VALUES 3"); - assertThat(query(withS3SelectPushdown, "SELECT id FROM " + table.getName() + " WHERE int_t IS NULL")).returnsEmptyResult(); - - assertQueryFails( - withS3SelectPushdown, - "SELECT id FROM " + table.getName() + " WHERE bool_t = true", - "S3 returned an error: Error casting:.*"); - } - - List specialCharacterValues = ImmutableList.of( - "1, 'a,comma'", - "2, 'a|pipe'", - "3, 'an''escaped quote'", - "4, 'a~null encoding'"); - try (TestTable table = new TestTable( - sql -> getQueryRunner().execute(usingAppendInserts, sql), - "hive.%s.test_s3_select_pushdown_special_characters".formatted(HIVE_TEST_SCHEMA), - "(id INT, string_t VARCHAR) WITH (format = 'TEXTFILE', textfile_field_separator=',', textfile_field_separator_escape='|', null_format='~')", - specialCharacterValues)) { - // These two should return a result, but incorrectly return nothing - String selectWithComma = "SELECT id FROM " + table.getName() + " WHERE string_t ='a,comma'"; - assertQuery(withoutS3SelectPushdown, selectWithComma, "VALUES 1"); - assertThat(query(withS3SelectPushdown, selectWithComma)).returnsEmptyResult(); - - String selectWithPipe = "SELECT id FROM " + table.getName() + " WHERE string_t ='a|pipe'"; - assertQuery(withoutS3SelectPushdown, selectWithPipe, "VALUES 2"); - assertThat(query(withS3SelectPushdown, selectWithPipe)).returnsEmptyResult(); - - // These two are actually correct - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t ='an''escaped quote'", "VALUES 3"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t ='a~null encoding'", "VALUES 4"); - } - } - - private void assertS3SelectQuery(@Language("SQL") String query, @Language("SQL") String expectedValues) - { - Session withS3SelectPushdown = Session.builder(getSession()) - .setCatalogSessionProperty("hive", "s3_select_pushdown_enabled", "true") - .setCatalogSessionProperty("hive", "json_native_reader_enabled", "false") - .setCatalogSessionProperty("hive", "text_file_native_reader_enabled", "false") - .build(); - - MaterializedResult expectedResult = computeActual(expectedValues); - assertQueryStats( - withS3SelectPushdown, - query, - statsWithPushdown -> { - long inputPositionsWithPushdown = statsWithPushdown.getPhysicalInputPositions(); - assertQueryStats( - getSession(), - 
query, - statsWithoutPushdown -> assertThat(statsWithoutPushdown.getPhysicalInputPositions()).isGreaterThan(inputPositionsWithPushdown), - results -> assertEquals(results.getOnlyColumnAsSet(), expectedResult.getOnlyColumnAsSet())); - }, - results -> assertEquals(results.getOnlyColumnAsSet(), expectedResult.getOnlyColumnAsSet())); - } - - private void assertNoS3SelectPushdown(@Language("SQL") String query, @Language("SQL") String expectedValues) - { - Session withS3SelectPushdown = Session.builder(getSession()) - .setCatalogSessionProperty("hive", "s3_select_pushdown_enabled", "true") - .setCatalogSessionProperty("hive", "json_native_reader_enabled", "false") - .setCatalogSessionProperty("hive", "text_file_native_reader_enabled", "false") - .build(); - - MaterializedResult expectedResult = computeActual(expectedValues); - assertQueryStats( - withS3SelectPushdown, - query, - statsWithPushdown -> { - long inputPositionsWithPushdown = statsWithPushdown.getPhysicalInputPositions(); - assertQueryStats( - getSession(), - query, - statsWithoutPushdown -> assertThat(statsWithoutPushdown.getPhysicalInputPositions()).isEqualTo(inputPositionsWithPushdown), - results -> assertEquals(results.getOnlyColumnAsSet(), expectedResult.getOnlyColumnAsSet())); - }, - results -> assertEquals(results.getOnlyColumnAsSet(), expectedResult.getOnlyColumnAsSet())); - } - - @DataProvider - public static Object[][] s3SelectFileFormats() - { - return new Object[][] { - {"format = 'JSON'"}, - {"format = 'TEXTFILE', textfile_field_separator=',', textfile_field_separator_escape='|', null_format='~'"} - }; - } - @Test public void testDropStatsPartitionedTable() { diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/HiveTestUtils.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/HiveTestUtils.java index e4c5961e1a3fd..882dc74e34c22 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/HiveTestUtils.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/HiveTestUtils.java @@ -17,7 +17,6 @@ import com.google.common.collect.ImmutableSet; import com.google.common.net.HostAndPort; import io.airlift.slice.Slices; -import io.airlift.units.DataSize; import io.trino.filesystem.TrinoFileSystemFactory; import io.trino.filesystem.hdfs.HdfsFileSystemFactory; import io.trino.hdfs.DynamicHdfsConfiguration; @@ -57,8 +56,6 @@ import io.trino.plugin.hive.parquet.ParquetReaderConfig; import io.trino.plugin.hive.parquet.ParquetWriterConfig; import io.trino.plugin.hive.rcfile.RcFilePageSourceFactory; -import io.trino.plugin.hive.s3select.S3SelectRecordCursorProvider; -import io.trino.plugin.hive.s3select.TrinoS3ClientFactory; import io.trino.spi.PageSorter; import io.trino.spi.block.Block; import io.trino.spi.connector.ColumnHandle; @@ -97,7 +94,6 @@ import java.util.UUID; import static com.google.common.collect.ImmutableList.toImmutableList; -import static io.airlift.units.DataSize.Unit.MEGABYTE; import static io.trino.spi.block.ArrayValueBuilder.buildArrayValue; import static io.trino.spi.block.MapValueBuilder.buildMapValue; import static io.trino.spi.block.RowValueBuilder.buildRowValue; @@ -165,7 +161,6 @@ public static HiveSessionProperties getHiveSessionProperties(HiveConfig hiveConf { return new HiveSessionProperties( hiveConfig, - new HiveFormatsConfig(), orcReaderConfig, new OrcWriterConfig(), new ParquetReaderConfig(), @@ -176,7 +171,6 @@ public static HiveSessionProperties getHiveSessionProperties(HiveConfig hiveConf { return new HiveSessionProperties( hiveConfig, - new HiveFormatsConfig(), new 
OrcReaderConfig(), new OrcWriterConfig(), new ParquetReaderConfig(), @@ -187,7 +181,6 @@ public static HiveSessionProperties getHiveSessionProperties(HiveConfig hiveConf { return new HiveSessionProperties( hiveConfig, - new HiveFormatsConfig(), new OrcReaderConfig(), new OrcWriterConfig(), parquetReaderConfig, @@ -212,11 +205,6 @@ public static Set getDefaultHivePageSourceFactories(HdfsE .build(); } - public static Set getDefaultHiveRecordCursorProviders(HiveConfig hiveConfig, HdfsEnvironment hdfsEnvironment) - { - return ImmutableSet.of(new S3SelectRecordCursorProvider(hdfsEnvironment, new TrinoS3ClientFactory(hiveConfig), hiveConfig)); - } - public static Set getDefaultHiveFileWriterFactories(HiveConfig hiveConfig, HdfsEnvironment hdfsEnvironment) { TrinoFileSystemFactory fileSystemFactory = new HdfsFileSystemFactory(hdfsEnvironment, HDFS_FILE_SYSTEM_STATS); @@ -244,11 +232,6 @@ public static List getTypes(List columnHandles) return types.build(); } - public static HiveRecordCursorProvider createGenericHiveRecordCursorProvider(HdfsEnvironment hdfsEnvironment) - { - return new GenericHiveRecordCursorProvider(hdfsEnvironment, DataSize.of(100, MEGABYTE)); - } - public static MapType mapType(Type keyType, Type valueType) { return (MapType) TESTING_TYPE_MANAGER.getParameterizedType(StandardTypes.MAP, ImmutableList.of( diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestBackgroundHiveSplitLoader.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestBackgroundHiveSplitLoader.java index 9e65561e82bed..35ed1caaebea7 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestBackgroundHiveSplitLoader.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestBackgroundHiveSplitLoader.java @@ -22,6 +22,7 @@ import io.airlift.stats.CounterStat; import io.airlift.units.DataSize; import io.airlift.units.Duration; +import io.trino.filesystem.Location; import io.trino.filesystem.hdfs.HdfsFileSystemFactory; import io.trino.hdfs.DynamicHdfsConfiguration; import io.trino.hdfs.HdfsConfig; @@ -37,6 +38,7 @@ import io.trino.plugin.hive.metastore.StorageFormat; import io.trino.plugin.hive.metastore.Table; import io.trino.plugin.hive.util.HiveBucketing.HiveBucketFilter; +import io.trino.plugin.hive.util.InternalHiveSplitFactory; import io.trino.plugin.hive.util.ValidWriteIdList; import io.trino.spi.TrinoException; import io.trino.spi.connector.ColumnHandle; @@ -482,14 +484,12 @@ public HivePartitionMetadata next() createBucketSplitInfo(Optional.empty(), Optional.empty()), SESSION, new HdfsFileSystemFactory(hdfsEnvironment, HDFS_FILE_SYSTEM_STATS), - hdfsEnvironment, new HdfsNamenodeStats(), new CachingDirectoryLister(new HiveConfig()), executor, threads, false, false, - true, Optional.empty(), Optional.empty(), 100); @@ -796,37 +796,41 @@ public void testValidateFileBuckets() @Test public void testBuildManifestFileIterator() - throws Exception { CachingDirectoryLister directoryLister = new CachingDirectoryLister(new Duration(0, TimeUnit.MINUTES), DataSize.ofBytes(0), ImmutableList.of()); Properties schema = new Properties(); schema.setProperty(FILE_INPUT_FORMAT, SymlinkTextInputFormat.class.getName()); schema.setProperty(SERIALIZATION_LIB, AVRO.getSerde()); - Path firstFilePath = new Path("hdfs://VOL1:9000/db_name/table_name/file1"); - Path secondFilePath = new Path("hdfs://VOL1:9000/db_name/table_name/file2"); - List paths = ImmutableList.of(firstFilePath, secondFilePath); - List files = paths.stream() + Location firstFilePath = 
Location.of("hdfs://VOL1:9000/db_name/table_name/file1"); + Location secondFilePath = Location.of("hdfs://VOL1:9000/db_name/table_name/file2"); + List locations = ImmutableList.of(firstFilePath, secondFilePath); + List files = locations.stream() .map(TestBackgroundHiveSplitLoader::locatedFileStatus) .collect(toImmutableList()); - BackgroundHiveSplitLoader backgroundHiveSplitLoader = backgroundHiveSplitLoader( - files, - directoryLister); - Optional> splitIterator = backgroundHiveSplitLoader.buildManifestFileIterator( - AVRO, + InternalHiveSplitFactory splitFactory = new InternalHiveSplitFactory( "partition", + AVRO, schema, ImmutableList.of(), TupleDomain.all(), () -> true, - false, TableToPartitionMapping.empty(), - new Path("hdfs://VOL1:9000/db_name/table_name"), - paths, + Optional.empty(), + Optional.empty(), + DataSize.of(512, MEGABYTE), + false, + Optional.empty()); + BackgroundHiveSplitLoader backgroundHiveSplitLoader = backgroundHiveSplitLoader( + files, + directoryLister); + Iterator splitIterator = backgroundHiveSplitLoader.buildManifestFileIterator( + splitFactory, + Location.of("hdfs://VOL1:9000/db_name/table_name"), + locations, true); - assertTrue(splitIterator.isPresent()); - List splits = ImmutableList.copyOf(splitIterator.get()); + List splits = ImmutableList.copyOf(splitIterator); assertEquals(splits.size(), 2); assertEquals(splits.get(0).getPath(), firstFilePath.toString()); assertEquals(splits.get(1).getPath(), secondFilePath.toString()); @@ -834,36 +838,45 @@ public void testBuildManifestFileIterator() @Test public void testBuildManifestFileIteratorNestedDirectory() - throws Exception { CachingDirectoryLister directoryLister = new CachingDirectoryLister(new Duration(5, TimeUnit.MINUTES), DataSize.of(100, KILOBYTE), ImmutableList.of()); Properties schema = new Properties(); schema.setProperty(FILE_INPUT_FORMAT, SymlinkTextInputFormat.class.getName()); schema.setProperty(SERIALIZATION_LIB, AVRO.getSerde()); - Path filePath = new Path("hdfs://VOL1:9000/db_name/table_name/file1"); - Path directoryPath = new Path("hdfs://VOL1:9000/db_name/table_name/dir/file2"); - List paths = ImmutableList.of(filePath, directoryPath); + Location filePath = Location.of("hdfs://VOL1:9000/db_name/table_name/file1"); + Location directoryPath = Location.of("hdfs://VOL1:9000/db_name/table_name/dir/file2"); + List locations = ImmutableList.of(filePath, directoryPath); List files = ImmutableList.of( locatedFileStatus(filePath), locatedFileStatus(directoryPath)); - BackgroundHiveSplitLoader backgroundHiveSplitLoader = backgroundHiveSplitLoader( - files, - directoryLister); - Optional> splitIterator = backgroundHiveSplitLoader.buildManifestFileIterator( - AVRO, + InternalHiveSplitFactory splitFactory = new InternalHiveSplitFactory( "partition", + AVRO, schema, ImmutableList.of(), TupleDomain.all(), () -> true, - false, TableToPartitionMapping.empty(), - new Path("hdfs://VOL1:9000/db_name/table_name"), - paths, + Optional.empty(), + Optional.empty(), + DataSize.of(512, MEGABYTE), + false, + Optional.empty()); + + BackgroundHiveSplitLoader backgroundHiveSplitLoader = backgroundHiveSplitLoader( + files, + directoryLister); + Iterator splitIterator = backgroundHiveSplitLoader.buildManifestFileIterator( + splitFactory, + Location.of("hdfs://VOL1:9000/db_name/table_name"), + locations, false); - assertTrue(splitIterator.isEmpty()); + List splits = ImmutableList.copyOf(splitIterator); + assertEquals(splits.size(), 2); + assertEquals(splits.get(0).getPath(), filePath.toString()); + 
assertEquals(splits.get(1).getPath(), directoryPath.toString()); } @Test @@ -1104,14 +1117,12 @@ private BackgroundHiveSplitLoader backgroundHiveSplitLoader( createBucketSplitInfo(bucketHandle, hiveBucketFilter), SESSION, new HdfsFileSystemFactory(hdfsEnvironment, HDFS_FILE_SYSTEM_STATS), - hdfsEnvironment, new HdfsNamenodeStats(), new CachingDirectoryLister(new HiveConfig()), executor, 2, false, false, - true, validWriteIds, Optional.empty(), 100); @@ -1149,14 +1160,12 @@ private BackgroundHiveSplitLoader backgroundHiveSplitLoader( Optional.empty(), connectorSession, new HdfsFileSystemFactory(hdfsEnvironment, HDFS_FILE_SYSTEM_STATS), - hdfsEnvironment, new HdfsNamenodeStats(), directoryLister, executor, 2, false, false, - true, Optional.empty(), Optional.empty(), maxPartitions); @@ -1178,14 +1187,12 @@ private BackgroundHiveSplitLoader backgroundHiveSplitLoaderOfflinePartitions() createBucketSplitInfo(Optional.empty(), Optional.empty()), connectorSession, new HdfsFileSystemFactory(hdfsEnvironment, HDFS_FILE_SYSTEM_STATS), - hdfsEnvironment, new HdfsNamenodeStats(), new CachingDirectoryLister(new HiveConfig()), executor, 2, false, false, - true, Optional.empty(), Optional.empty(), 100); @@ -1297,6 +1304,11 @@ private static Table table( .build(); } + private static LocatedFileStatus locatedFileStatus(Location location) + { + return locatedFileStatus(new Path(location.toString()), 10); + } + private static LocatedFileStatus locatedFileStatus(Path path) { return locatedFileStatus(path, 10); diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveConfig.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveConfig.java index de8264a4cbc88..674d5a5352e3c 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveConfig.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveConfig.java @@ -91,9 +91,6 @@ public void testDefaults() .setPartitionStatisticsSampleSize(100) .setIgnoreCorruptedStatistics(false) .setCollectColumnStatisticsOnWrite(true) - .setS3SelectPushdownEnabled(false) - .setS3SelectExperimentalPushdownEnabled(false) - .setS3SelectPushdownMaxConnections(500) .setTemporaryStagingDirectoryEnabled(true) .setTemporaryStagingDirectoryPath("/tmp/presto-${USER}") .setDelegateTransactionalManagedTableLocationToMetastore(false) @@ -112,7 +109,6 @@ public void testDefaults() .setProjectionPushdownEnabled(true) .setDynamicFilteringWaitTimeout(new Duration(0, TimeUnit.MINUTES)) .setTimestampPrecision(HiveTimestampPrecision.DEFAULT_PRECISION) - .setOptimizeSymlinkListing(true) .setIcebergCatalogName(null) .setSizeBasedSplitWeightsEnabled(true) .setMinimumAssignedSplitWeight(0.05) @@ -178,9 +174,6 @@ public void testExplicitPropertyMappings() .put("hive.partition-statistics-sample-size", "1234") .put("hive.ignore-corrupted-statistics", "true") .put("hive.collect-column-statistics-on-write", "false") - .put("hive.s3select-pushdown.enabled", "true") - .put("hive.s3select-pushdown.experimental-textfile-pushdown-enabled", "true") - .put("hive.s3select-pushdown.max-connections", "1234") .put("hive.temporary-staging-directory-enabled", "false") .put("hive.temporary-staging-directory-path", "updated") .put("hive.delegate-transactional-managed-table-location-to-metastore", "true") @@ -199,7 +192,6 @@ public void testExplicitPropertyMappings() .put("hive.projection-pushdown-enabled", "false") .put("hive.dynamic-filtering.wait-timeout", "10s") .put("hive.timestamp-precision", "NANOSECONDS") - .put("hive.optimize-symlink-listing", "false") 
.put("hive.iceberg-catalog-name", "iceberg") .put("hive.size-based-split-weights-enabled", "false") .put("hive.minimum-assigned-split-weight", "1.0") @@ -262,9 +254,6 @@ public void testExplicitPropertyMappings() .setPartitionStatisticsSampleSize(1234) .setIgnoreCorruptedStatistics(true) .setCollectColumnStatisticsOnWrite(false) - .setS3SelectPushdownEnabled(true) - .setS3SelectExperimentalPushdownEnabled(true) - .setS3SelectPushdownMaxConnections(1234) .setTemporaryStagingDirectoryEnabled(false) .setTemporaryStagingDirectoryPath("updated") .setDelegateTransactionalManagedTableLocationToMetastore(true) @@ -283,7 +272,6 @@ public void testExplicitPropertyMappings() .setProjectionPushdownEnabled(false) .setDynamicFilteringWaitTimeout(new Duration(10, TimeUnit.SECONDS)) .setTimestampPrecision(HiveTimestampPrecision.NANOSECONDS) - .setOptimizeSymlinkListing(false) .setIcebergCatalogName("iceberg") .setSizeBasedSplitWeightsEnabled(false) .setMinimumAssignedSplitWeight(1.0) diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveCreateExternalTable.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveCreateExternalTable.java index d8b590ffc8ad3..a32a0ef7dc461 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveCreateExternalTable.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveCreateExternalTable.java @@ -54,13 +54,13 @@ public void testCreateExternalTableWithData() throws IOException { Path tempDir = createTempDirectory(null); - Path tableLocation = tempDir.resolve("data"); + String tableLocation = tempDir.resolve("data").toUri().toString(); @Language("SQL") String createTableSql = format("" + "CREATE TABLE test_create_external " + "WITH (external_location = '%s') AS " + "SELECT * FROM tpch.tiny.nation", - tableLocation.toUri().toASCIIString()); + tableLocation); assertUpdate(createTableSql, 25); @@ -70,7 +70,7 @@ public void testCreateExternalTableWithData() MaterializedResult result = computeActual("SELECT DISTINCT regexp_replace(\"$path\", '/[^/]*$', '/') FROM test_create_external"); String tablePath = (String) result.getOnlyValue(); - assertThat(tablePath).startsWith(tableLocation.toFile().toURI().toString()); + assertThat(tablePath).startsWith(tableLocation); assertUpdate("DROP TABLE test_create_external"); deleteRecursively(tempDir, ALLOW_INSECURE); diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveFileFormats.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveFileFormats.java index 36d39e8e1f6f3..968e996718626 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveFileFormats.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveFileFormats.java @@ -16,8 +16,6 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Lists; -import io.airlift.compress.lzo.LzoCodec; -import io.airlift.compress.lzo.LzopCodec; import io.trino.filesystem.Location; import io.trino.filesystem.TrinoFileSystemFactory; import io.trino.filesystem.hdfs.HdfsFileSystemFactory; @@ -47,12 +45,9 @@ import io.trino.plugin.hive.rcfile.RcFilePageSourceFactory; import io.trino.spi.connector.ConnectorPageSource; import io.trino.spi.connector.ConnectorSession; -import io.trino.spi.connector.RecordCursor; -import io.trino.spi.connector.RecordPageSource; import io.trino.spi.predicate.TupleDomain; import io.trino.spi.type.Type; import io.trino.testing.TestingConnectorSession; -import 
org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.common.type.HiveVarchar; import org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat; import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; @@ -85,7 +80,6 @@ import static com.google.common.base.Preconditions.checkState; import static com.google.common.collect.ImmutableList.toImmutableList; import static io.airlift.slice.Slices.utf8Slice; -import static io.trino.hadoop.ConfigurationInstantiator.newEmptyConfiguration; import static io.trino.plugin.hive.HivePageSourceProvider.ColumnMapping.buildColumnMappings; import static io.trino.plugin.hive.HiveStorageFormat.AVRO; import static io.trino.plugin.hive.HiveStorageFormat.CSV; @@ -101,7 +95,6 @@ import static io.trino.plugin.hive.HiveTestUtils.HDFS_FILE_SYSTEM_FACTORY; import static io.trino.plugin.hive.HiveTestUtils.HDFS_FILE_SYSTEM_STATS; import static io.trino.plugin.hive.HiveTestUtils.SESSION; -import static io.trino.plugin.hive.HiveTestUtils.createGenericHiveRecordCursorProvider; import static io.trino.plugin.hive.HiveTestUtils.getHiveSession; import static io.trino.plugin.hive.HiveTestUtils.getTypes; import static io.trino.plugin.hive.acid.AcidTransaction.NO_ACID_TRANSACTION; @@ -170,7 +163,6 @@ public void testTextFile(int rowCount, long fileSizePadding) .withRowsCount(rowCount) .withFileSizePadding(fileSizePadding) .withFileWriterFactory(new SimpleTextFileWriterFactory(HDFS_FILE_SYSTEM_FACTORY, TESTING_TYPE_MANAGER)) - .isReadableByRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)) .isReadableByPageSource(new SimpleTextFilePageSourceFactory(HDFS_FILE_SYSTEM_FACTORY, new HiveConfig())); } @@ -188,7 +180,6 @@ public void testSequenceFile(int rowCount, long fileSizePadding) .withRowsCount(rowCount) .withFileSizePadding(fileSizePadding) .withFileWriterFactory(new SimpleSequenceFileWriterFactory(HDFS_FILE_SYSTEM_FACTORY, TESTING_TYPE_MANAGER, new NodeVersion("test"))) - .isReadableByRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)) .isReadableByPageSource(new SimpleSequenceFilePageSourceFactory(HDFS_FILE_SYSTEM_FACTORY, new HiveConfig())); } @@ -208,7 +199,6 @@ public void testCsvFile(int rowCount, long fileSizePadding) .withRowsCount(rowCount) .withFileSizePadding(fileSizePadding) .withFileWriterFactory(new CsvFileWriterFactory(HDFS_FILE_SYSTEM_FACTORY, TESTING_TYPE_MANAGER)) - .isReadableByRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)) .isReadableByPageSource(new CsvPageSourceFactory(HDFS_FILE_SYSTEM_FACTORY, new HiveConfig())); } @@ -222,7 +212,6 @@ public void testCsvFileWithNullAndValue() new TestColumn("t_string", javaStringObjectInspector, "test", utf8Slice("test")))) .withRowsCount(2) .withFileWriterFactory(new CsvFileWriterFactory(HDFS_FILE_SYSTEM_FACTORY, TESTING_TYPE_MANAGER)) - .isReadableByRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)) .isReadableByPageSource(new CsvPageSourceFactory(HDFS_FILE_SYSTEM_FACTORY, new HiveConfig())); } @@ -253,7 +242,6 @@ public void testJson(int rowCount, long fileSizePadding) .withRowsCount(rowCount) .withFileSizePadding(fileSizePadding) .withFileWriterFactory(new JsonFileWriterFactory(HDFS_FILE_SYSTEM_FACTORY, TESTING_TYPE_MANAGER)) - .isReadableByRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)) .isReadableByPageSource(new JsonPageSourceFactory(HDFS_FILE_SYSTEM_FACTORY, new HiveConfig())); } @@ -300,7 +288,6 @@ public void testRcTextOptimizedWriter(int rowCount) .withColumns(testColumns) 
.withRowsCount(rowCount) .withFileWriterFactory(new RcFileFileWriterFactory(FILE_SYSTEM_FACTORY, TESTING_TYPE_MANAGER, new NodeVersion("test"), HIVE_STORAGE_TIME_ZONE)) - .isReadableByRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)) .isReadableByPageSource(new RcFilePageSourceFactory(FILE_SYSTEM_FACTORY, new HiveConfig())); } @@ -344,8 +331,7 @@ public void testRcBinaryOptimizedWriter(int rowCount) .withSkipGenericWriterTest() .withFileWriterFactory(new RcFileFileWriterFactory(FILE_SYSTEM_FACTORY, TESTING_TYPE_MANAGER, new NodeVersion("test"), HIVE_STORAGE_TIME_ZONE)) .isReadableByPageSource(new RcFilePageSourceFactory(FILE_SYSTEM_FACTORY, new HiveConfig())) - .withColumns(testColumnsNoTimestamps) - .isReadableByRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)); + .withColumns(testColumnsNoTimestamps); } @Test(dataProvider = "validRowAndFileSizePadding") @@ -365,7 +351,6 @@ public void testOrcOptimizedWriter(int rowCount, long fileSizePadding) { HiveSessionProperties hiveSessionProperties = new HiveSessionProperties( new HiveConfig(), - new HiveFormatsConfig(), new OrcReaderConfig(), new OrcWriterConfig() .setValidationPercentage(100.0), @@ -386,7 +371,6 @@ public void testOrcOptimizedWriter(int rowCount, long fileSizePadding) .withSession(session) .withFileSizePadding(fileSizePadding) .withFileWriterFactory(new OrcFileWriterFactory(TESTING_TYPE_MANAGER, new NodeVersion("test"), STATS, new OrcWriterOptions(), HDFS_FILE_SYSTEM_FACTORY)) - .isReadableByRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)) .isReadableByPageSource(new OrcPageSourceFactory(new OrcReaderOptions(), HDFS_FILE_SYSTEM_FACTORY, STATS, UTC)); } @@ -435,8 +419,7 @@ public void testAvro(int rowCount, long fileSizePadding) .withRowsCount(rowCount) .withFileSizePadding(fileSizePadding) .withFileWriterFactory(new AvroFileWriterFactory(FILE_SYSTEM_FACTORY, TESTING_TYPE_MANAGER, new NodeVersion("test_version"))) - .isReadableByPageSource(new AvroPageSourceFactory(FILE_SYSTEM_FACTORY)) - .isReadableByRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)); + .isReadableByPageSource(new AvroPageSourceFactory(FILE_SYSTEM_FACTORY)); } @Test(dataProvider = "rowCount") @@ -451,7 +434,6 @@ public void testAvroFileInSymlinkTable(int rowCount) Properties splitProperties = new Properties(); splitProperties.setProperty(FILE_INPUT_FORMAT, SymlinkTextInputFormat.class.getName()); splitProperties.setProperty(SERIALIZATION_LIB, AVRO.getSerde()); - testCursorProvider(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT), split, splitProperties, getTestColumnsSupportedByAvro(), SESSION, file.length(), rowCount); testPageSourceFactory(new AvroPageSourceFactory(FILE_SYSTEM_FACTORY), split, AVRO, getTestColumnsSupportedByAvro(), SESSION, file.length(), rowCount); } finally { @@ -571,14 +553,12 @@ public void testTruncateVarcharColumn() assertThatFileFormat(RCTEXT) .withWriteColumns(ImmutableList.of(writeColumn)) .withReadColumns(ImmutableList.of(readColumn)) - .isReadableByPageSource(new RcFilePageSourceFactory(FILE_SYSTEM_FACTORY, new HiveConfig())) - .isReadableByRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)); + .isReadableByPageSource(new RcFilePageSourceFactory(FILE_SYSTEM_FACTORY, new HiveConfig())); assertThatFileFormat(RCBINARY) .withWriteColumns(ImmutableList.of(writeColumn)) .withReadColumns(ImmutableList.of(readColumn)) - .isReadableByPageSource(new RcFilePageSourceFactory(FILE_SYSTEM_FACTORY, new HiveConfig())) - 
.isReadableByRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)); + .isReadableByPageSource(new RcFilePageSourceFactory(FILE_SYSTEM_FACTORY, new HiveConfig())); assertThatFileFormat(ORC) .withWriteColumns(ImmutableList.of(writeColumn)) @@ -595,21 +575,18 @@ public void testTruncateVarcharColumn() .withWriteColumns(ImmutableList.of(writeColumn)) .withReadColumns(ImmutableList.of(readColumn)) .withFileWriterFactory(new AvroFileWriterFactory(FILE_SYSTEM_FACTORY, TESTING_TYPE_MANAGER, new NodeVersion("test_version"))) - .isReadableByRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)) .isReadableByPageSource(new AvroPageSourceFactory(FILE_SYSTEM_FACTORY)); assertThatFileFormat(SEQUENCEFILE) .withWriteColumns(ImmutableList.of(writeColumn)) .withReadColumns(ImmutableList.of(readColumn)) .withFileWriterFactory(new SimpleSequenceFileWriterFactory(HDFS_FILE_SYSTEM_FACTORY, TESTING_TYPE_MANAGER, new NodeVersion("test"))) - .isReadableByRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)) .isReadableByPageSource(new SimpleSequenceFilePageSourceFactory(HDFS_FILE_SYSTEM_FACTORY, new HiveConfig())); assertThatFileFormat(TEXTFILE) .withWriteColumns(ImmutableList.of(writeColumn)) .withReadColumns(ImmutableList.of(readColumn)) .withFileWriterFactory(new SimpleTextFileWriterFactory(HDFS_FILE_SYSTEM_FACTORY, TESTING_TYPE_MANAGER)) - .isReadableByRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)) .isReadableByPageSource(new SimpleTextFilePageSourceFactory(HDFS_FILE_SYSTEM_FACTORY, new HiveConfig())); } @@ -634,7 +611,6 @@ public void testAvroProjectedColumns(int rowCount) .withReadColumns(readColumns) .withRowsCount(rowCount) .withFileWriterFactory(new AvroFileWriterFactory(FILE_SYSTEM_FACTORY, TESTING_TYPE_MANAGER, new NodeVersion("test_version"))) - .isReadableByRecordCursorPageSource(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)) .isReadableByPageSource(new AvroPageSourceFactory(FILE_SYSTEM_FACTORY)); } @@ -724,7 +700,6 @@ public void testSequenceFileProjectedColumns(int rowCount) .withReadColumns(readColumns) .withRowsCount(rowCount) .withFileWriterFactory(new SimpleSequenceFileWriterFactory(HDFS_FILE_SYSTEM_FACTORY, TESTING_TYPE_MANAGER, new NodeVersion("test"))) - .isReadableByRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)) .isReadableByPageSource(new SimpleSequenceFilePageSourceFactory(HDFS_FILE_SYSTEM_FACTORY, new HiveConfig())); } @@ -753,41 +728,9 @@ public void testTextFileProjectedColumns(int rowCount) .withReadColumns(readColumns) .withRowsCount(rowCount) .withFileWriterFactory(new SimpleTextFileWriterFactory(HDFS_FILE_SYSTEM_FACTORY, TESTING_TYPE_MANAGER)) - .isReadableByRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)) .isReadableByPageSource(new SimpleTextFilePageSourceFactory(HDFS_FILE_SYSTEM_FACTORY, new HiveConfig())); } - @Test(dataProvider = "rowCount") - public void testRCTextProjectedColumns(int rowCount) - throws Exception - { - List supportedColumns = TEST_COLUMNS.stream() - .filter(testColumn -> { - // TODO: This is a bug in the RC text reader - // RC file does not support complex type as key of a map - return !testColumn.getName().equals("t_struct_null") - && !testColumn.getName().equals("t_map_null_key_complex_key_value"); - }) - .collect(toImmutableList()); - - List regularColumns = getRegularColumns(supportedColumns); - List partitionColumns = getPartitionColumns(supportedColumns); - - // Created projected columns for all regular supported columns - 
ImmutableList.Builder writeColumnsBuilder = ImmutableList.builder(); - ImmutableList.Builder readeColumnsBuilder = ImmutableList.builder(); - generateProjectedColumns(regularColumns, writeColumnsBuilder, readeColumnsBuilder); - - List writeColumns = writeColumnsBuilder.addAll(partitionColumns).build(); - List readColumns = readeColumnsBuilder.addAll(partitionColumns).build(); - - assertThatFileFormat(RCTEXT) - .withWriteColumns(writeColumns) - .withReadColumns(readColumns) - .withRowsCount(rowCount) - .isReadableByRecordCursorPageSource(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)); - } - @Test(dataProvider = "rowCount") public void testRCTextProjectedColumnsPageSource(int rowCount) throws Exception @@ -888,13 +831,11 @@ public void testFailForLongVarcharPartitionColumn() assertThatFileFormat(RCTEXT) .withColumns(columns) - .isFailingForPageSource(new RcFilePageSourceFactory(FILE_SYSTEM_FACTORY, new HiveConfig()), expectedErrorCode, expectedMessage) - .isFailingForRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT), expectedErrorCode, expectedMessage); + .isFailingForPageSource(new RcFilePageSourceFactory(FILE_SYSTEM_FACTORY, new HiveConfig()), expectedErrorCode, expectedMessage); assertThatFileFormat(RCBINARY) .withColumns(columns) - .isFailingForPageSource(new RcFilePageSourceFactory(FILE_SYSTEM_FACTORY, new HiveConfig()), expectedErrorCode, expectedMessage) - .isFailingForRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT), expectedErrorCode, expectedMessage); + .isFailingForPageSource(new RcFilePageSourceFactory(FILE_SYSTEM_FACTORY, new HiveConfig()), expectedErrorCode, expectedMessage); assertThatFileFormat(ORC) .withColumns(columns) @@ -904,141 +845,6 @@ public void testFailForLongVarcharPartitionColumn() .withColumns(columns) .withSession(PARQUET_SESSION) .isFailingForPageSource(PARQUET_PAGE_SOURCE_FACTORY, expectedErrorCode, expectedMessage); - - assertThatFileFormat(SEQUENCEFILE) - .withColumns(columns) - .isFailingForRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT), expectedErrorCode, expectedMessage); - - assertThatFileFormat(TEXTFILE) - .withColumns(columns) - .isFailingForRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT), expectedErrorCode, expectedMessage); - } - - private void testRecordPageSource( - HiveRecordCursorProvider cursorProvider, - FileSplit split, - HiveStorageFormat storageFormat, - List testReadColumns, - ConnectorSession session, - long fileSize, - int rowCount) - throws Exception - { - Properties splitProperties = new Properties(); - splitProperties.setProperty(FILE_INPUT_FORMAT, storageFormat.getInputFormat()); - splitProperties.setProperty(SERIALIZATION_LIB, storageFormat.getSerde()); - ConnectorPageSource pageSource = createPageSourceFromCursorProvider(cursorProvider, split, splitProperties, fileSize, testReadColumns, session); - checkPageSource(pageSource, testReadColumns, getTypes(getColumnHandles(testReadColumns)), rowCount); - } - - private void testCursorProvider( - HiveRecordCursorProvider cursorProvider, - FileSplit split, - HiveStorageFormat storageFormat, - List testReadColumns, - ConnectorSession session, - long fileSize, - int rowCount) - { - Properties splitProperties = new Properties(); - splitProperties.setProperty(FILE_INPUT_FORMAT, storageFormat.getInputFormat()); - splitProperties.setProperty(SERIALIZATION_LIB, storageFormat.getSerde()); - testCursorProvider(cursorProvider, split, splitProperties, testReadColumns, session, fileSize, rowCount); - } - - 
private void testCursorProvider( - HiveRecordCursorProvider cursorProvider, - FileSplit split, - Properties splitProperties, - List testReadColumns, - ConnectorSession session, - long fileSize, - int rowCount) - { - ConnectorPageSource pageSource = createPageSourceFromCursorProvider(cursorProvider, split, splitProperties, fileSize, testReadColumns, session); - RecordCursor cursor = ((RecordPageSource) pageSource).getCursor(); - checkCursor(cursor, testReadColumns, rowCount); - } - - private ConnectorPageSource createPageSourceFromCursorProvider( - HiveRecordCursorProvider cursorProvider, - FileSplit split, - Properties splitProperties, - long fileSize, - List testReadColumns, - ConnectorSession session) - { - // Use full columns in split properties - ImmutableList.Builder splitPropertiesColumnNames = ImmutableList.builder(); - ImmutableList.Builder splitPropertiesColumnTypes = ImmutableList.builder(); - Set baseColumnNames = new HashSet<>(); - - for (TestColumn testReadColumn : testReadColumns) { - String name = testReadColumn.getBaseName(); - if (!baseColumnNames.contains(name) && !testReadColumn.isPartitionKey()) { - baseColumnNames.add(name); - splitPropertiesColumnNames.add(name); - splitPropertiesColumnTypes.add(testReadColumn.getBaseObjectInspector().getTypeName()); - } - } - - splitProperties.setProperty( - "columns", - splitPropertiesColumnNames.build().stream() - .collect(Collectors.joining(","))); - - splitProperties.setProperty( - "columns.types", - splitPropertiesColumnTypes.build().stream() - .collect(Collectors.joining(","))); - - List partitionKeys = testReadColumns.stream() - .filter(TestColumn::isPartitionKey) - .map(input -> new HivePartitionKey(input.getName(), (String) input.getWriteValue())) - .collect(toList()); - - String partitionName = String.join("/", partitionKeys.stream() - .map(partitionKey -> format("%s=%s", partitionKey.getName(), partitionKey.getValue())) - .collect(toImmutableList())); - - Configuration configuration = newEmptyConfiguration(); - configuration.set("io.compression.codecs", LzoCodec.class.getName() + "," + LzopCodec.class.getName()); - - List columnHandles = getColumnHandles(testReadColumns); - List columnMappings = buildColumnMappings( - partitionName, - partitionKeys, - columnHandles, - ImmutableList.of(), - TableToPartitionMapping.empty(), - split.getPath().toString(), - OptionalInt.empty(), - fileSize, - Instant.now().toEpochMilli()); - - Optional pageSource = HivePageSourceProvider.createHivePageSource( - ImmutableSet.of(), - ImmutableSet.of(cursorProvider), - configuration, - session, - Location.of(split.getPath().toString()), - OptionalInt.empty(), - split.getStart(), - split.getLength(), - fileSize, - splitProperties, - TupleDomain.all(), - columnHandles, - TESTING_TYPE_MANAGER, - Optional.empty(), - Optional.empty(), - false, - Optional.empty(), - false, - NO_ACID_TRANSACTION, - columnMappings); - - return pageSource.get(); } private void testPageSourceFactory( @@ -1096,8 +902,6 @@ private void testPageSourceFactory( Optional pageSource = HivePageSourceProvider.createHivePageSource( ImmutableSet.of(sourceFactory), - ImmutableSet.of(), - newEmptyConfiguration(), session, Location.of(split.getPath().toString()), OptionalInt.empty(), @@ -1106,11 +910,9 @@ private void testPageSourceFactory( fileSize, splitProperties, TupleDomain.all(), - columnHandles, TESTING_TYPE_MANAGER, Optional.empty(), Optional.empty(), - false, Optional.empty(), false, NO_ACID_TRANSACTION, @@ -1305,39 +1107,18 @@ public FileFormatAssertion withFileSizePadding(long 
fileSizePadding) public FileFormatAssertion isReadableByPageSource(HivePageSourceFactory pageSourceFactory) throws Exception { - assertRead(Optional.of(pageSourceFactory), Optional.empty(), false); - return this; - } - - public FileFormatAssertion isReadableByRecordCursorPageSource(HiveRecordCursorProvider cursorProvider) - throws Exception - { - assertRead(Optional.empty(), Optional.of(cursorProvider), true); - return this; - } - - public FileFormatAssertion isReadableByRecordCursor(HiveRecordCursorProvider cursorProvider) - throws Exception - { - assertRead(Optional.empty(), Optional.of(cursorProvider), false); + assertRead(Optional.of(pageSourceFactory)); return this; } public FileFormatAssertion isFailingForPageSource(HivePageSourceFactory pageSourceFactory, HiveErrorCode expectedErrorCode, String expectedMessage) throws Exception { - assertFailure(Optional.of(pageSourceFactory), Optional.empty(), expectedErrorCode, expectedMessage, false); + assertFailure(Optional.of(pageSourceFactory), expectedErrorCode, expectedMessage); return this; } - public FileFormatAssertion isFailingForRecordCursor(HiveRecordCursorProvider cursorProvider, HiveErrorCode expectedErrorCode, String expectedMessage) - throws Exception - { - assertFailure(Optional.empty(), Optional.of(cursorProvider), expectedErrorCode, expectedMessage, false); - return this; - } - - private void assertRead(Optional pageSourceFactory, Optional cursorProvider, boolean withRecordPageSource) + private void assertRead(Optional pageSourceFactory) throws Exception { assertNotNull(storageFormat, "storageFormat must be specified"); @@ -1372,14 +1153,6 @@ private void assertRead(Optional pageSourceFactory, Optio if (pageSourceFactory.isPresent()) { testPageSourceFactory(pageSourceFactory.get(), split, storageFormat, readColumns, session, fileSize, rowsCount); } - if (cursorProvider.isPresent()) { - if (withRecordPageSource) { - testRecordPageSource(cursorProvider.get(), split, storageFormat, readColumns, session, fileSize, rowsCount); - } - else { - testCursorProvider(cursorProvider.get(), split, storageFormat, readColumns, session, fileSize, rowsCount); - } - } } finally { //noinspection ResultOfMethodCallIgnored @@ -1390,12 +1163,10 @@ private void assertRead(Optional pageSourceFactory, Optio private void assertFailure( Optional pageSourceFactory, - Optional cursorProvider, HiveErrorCode expectedErrorCode, - String expectedMessage, - boolean withRecordPageSource) + String expectedMessage) { - assertTrinoExceptionThrownBy(() -> assertRead(pageSourceFactory, cursorProvider, withRecordPageSource)) + assertTrinoExceptionThrownBy(() -> assertRead(pageSourceFactory)) .hasErrorCode(expectedErrorCode) .hasMessage(expectedMessage); } diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveFormatsConfig.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveFormatsConfig.java deleted file mode 100644 index 2c66aab6a3a63..0000000000000 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveFormatsConfig.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.trino.plugin.hive; - -import com.google.common.collect.ImmutableMap; -import org.testng.annotations.Test; - -import java.util.Map; - -import static io.airlift.configuration.testing.ConfigAssertions.assertFullMapping; -import static io.airlift.configuration.testing.ConfigAssertions.assertRecordedDefaults; -import static io.airlift.configuration.testing.ConfigAssertions.recordDefaults; - -public class TestHiveFormatsConfig -{ - @Test - public void testDefaults() - { - assertRecordedDefaults(recordDefaults(HiveFormatsConfig.class) - .setAvroFileNativeReaderEnabled(true) - .setAvroFileNativeWriterEnabled(true) - .setCsvNativeReaderEnabled(true) - .setCsvNativeWriterEnabled(true) - .setJsonNativeReaderEnabled(true) - .setJsonNativeWriterEnabled(true) - .setOpenXJsonNativeReaderEnabled(true) - .setOpenXJsonNativeWriterEnabled(true) - .setRegexNativeReaderEnabled(true) - .setTextFileNativeReaderEnabled(true) - .setTextFileNativeWriterEnabled(true) - .setSequenceFileNativeReaderEnabled(true) - .setSequenceFileNativeWriterEnabled(true)); - } - - @Test - public void testExplicitPropertyMappings() - { - Map properties = ImmutableMap.builder() - .put("avro.native-reader.enabled", "false") - .put("avro.native-writer.enabled", "false") - .put("csv.native-reader.enabled", "false") - .put("csv.native-writer.enabled", "false") - .put("json.native-reader.enabled", "false") - .put("json.native-writer.enabled", "false") - .put("openx-json.native-reader.enabled", "false") - .put("openx-json.native-writer.enabled", "false") - .put("regex.native-reader.enabled", "false") - .put("text-file.native-reader.enabled", "false") - .put("text-file.native-writer.enabled", "false") - .put("sequence-file.native-reader.enabled", "false") - .put("sequence-file.native-writer.enabled", "false") - .buildOrThrow(); - - HiveFormatsConfig expected = new HiveFormatsConfig() - .setAvroFileNativeReaderEnabled(false) - .setAvroFileNativeWriterEnabled(false) - .setCsvNativeReaderEnabled(false) - .setCsvNativeWriterEnabled(false) - .setJsonNativeReaderEnabled(false) - .setJsonNativeWriterEnabled(false) - .setOpenXJsonNativeReaderEnabled(false) - .setOpenXJsonNativeWriterEnabled(false) - .setRegexNativeReaderEnabled(false) - .setTextFileNativeReaderEnabled(false) - .setTextFileNativeWriterEnabled(false) - .setSequenceFileNativeReaderEnabled(false) - .setSequenceFileNativeWriterEnabled(false); - - assertFullMapping(properties, expected); - } -} diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHivePageSink.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHivePageSink.java index 66db3957b6fd9..ff22bf6f76f5f 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHivePageSink.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHivePageSink.java @@ -69,7 +69,6 @@ import static io.trino.plugin.hive.HiveTestUtils.PAGE_SORTER; import static io.trino.plugin.hive.HiveTestUtils.getDefaultHiveFileWriterFactories; import static io.trino.plugin.hive.HiveTestUtils.getDefaultHivePageSourceFactories; -import static io.trino.plugin.hive.HiveTestUtils.getDefaultHiveRecordCursorProviders; import static io.trino.plugin.hive.HiveTestUtils.getHiveSession; import static io.trino.plugin.hive.HiveTestUtils.getHiveSessionProperties; import static io.trino.plugin.hive.HiveType.HIVE_DATE; @@ -267,17 +266,13 @@ private static ConnectorPageSource 
createPageSource(HiveTransactionHandle transa TableToPartitionMapping.empty(), Optional.empty(), Optional.empty(), - false, Optional.empty(), SplitWeight.standard()); ConnectorTableHandle table = new HiveTableHandle(SCHEMA_NAME, TABLE_NAME, ImmutableMap.of(), ImmutableList.of(), ImmutableList.of(), Optional.empty()); HivePageSourceProvider provider = new HivePageSourceProvider( TESTING_TYPE_MANAGER, - HDFS_ENVIRONMENT, config, - getDefaultHivePageSourceFactories(HDFS_ENVIRONMENT, config), - getDefaultHiveRecordCursorProviders(config, HDFS_ENVIRONMENT), - new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT, config)); + getDefaultHivePageSourceFactories(HDFS_ENVIRONMENT, config)); return provider.createPageSource(transaction, getHiveSession(config), split, table, ImmutableList.copyOf(getColumnHandles()), DynamicFilter.EMPTY); } @@ -304,7 +299,6 @@ private static ConnectorPageSink createPageSink(HiveTransactionHandle transactio HivePageSinkProvider provider = new HivePageSinkProvider( getDefaultHiveFileWriterFactories(config, HDFS_ENVIRONMENT), HDFS_FILE_SYSTEM_FACTORY, - HDFS_ENVIRONMENT, PAGE_SORTER, HiveMetastoreFactory.ofInstance(metastore), new GroupByHashPageIndexerFactory(new JoinCompiler(typeOperators), typeOperators), diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveSplit.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveSplit.java index d90fd803da746..fba3cde81a11a 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveSplit.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveSplit.java @@ -79,7 +79,6 @@ public void testJsonRoundTrip() 16, ImmutableList.of(createBaseColumn("col", 5, HIVE_LONG, BIGINT, ColumnType.REGULAR, Optional.of("comment"))))), Optional.empty(), - false, Optional.of(acidInfo), SplitWeight.fromProportion(2.0)); // some non-standard value @@ -98,7 +97,6 @@ public void testJsonRoundTrip() assertEquals(actual.getTableToPartitionMapping().getTableToPartitionColumns(), expected.getTableToPartitionMapping().getTableToPartitionColumns()); assertEquals(actual.getBucketConversion(), expected.getBucketConversion()); assertEquals(actual.isForceLocalScheduling(), expected.isForceLocalScheduling()); - assertEquals(actual.isS3SelectPushdownEnabled(), expected.isS3SelectPushdownEnabled()); assertEquals(actual.getAcidInfo().get(), expected.getAcidInfo().get()); assertEquals(actual.getSplitWeight(), expected.getSplitWeight()); } diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveSplitSource.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveSplitSource.java index 22e008077bfe7..da8b9f9c19d55 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveSplitSource.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestHiveSplitSource.java @@ -335,7 +335,6 @@ private TestSplit(int id, OptionalInt bucketNumber, DataSize fileSize, BooleanSu TableToPartitionMapping.empty(), Optional.empty(), Optional.empty(), - false, Optional.empty(), partitionMatchSupplier); } diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestNodeLocalDynamicSplitPruning.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestNodeLocalDynamicSplitPruning.java index 57327dba08802..04350213ce986 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestNodeLocalDynamicSplitPruning.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestNodeLocalDynamicSplitPruning.java @@ -46,7 +46,6 @@ import static 
io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; import static io.trino.plugin.hive.HiveTestUtils.HDFS_ENVIRONMENT; import static io.trino.plugin.hive.HiveTestUtils.getDefaultHivePageSourceFactories; -import static io.trino.plugin.hive.HiveTestUtils.getDefaultHiveRecordCursorProviders; import static io.trino.plugin.hive.HiveType.HIVE_INT; import static io.trino.plugin.hive.util.HiveBucketing.BucketingVersion.BUCKETING_V1; import static io.trino.spi.type.IntegerType.INTEGER; @@ -135,7 +134,6 @@ private static ConnectorPageSource createTestingPageSource(HiveTransactionHandle TableToPartitionMapping.empty(), Optional.empty(), Optional.empty(), - false, Optional.empty(), SplitWeight.standard()); @@ -157,11 +155,8 @@ private static ConnectorPageSource createTestingPageSource(HiveTransactionHandle HivePageSourceProvider provider = new HivePageSourceProvider( TESTING_TYPE_MANAGER, - HDFS_ENVIRONMENT, hiveConfig, - getDefaultHivePageSourceFactories(HDFS_ENVIRONMENT, hiveConfig), - getDefaultHiveRecordCursorProviders(hiveConfig, HDFS_ENVIRONMENT), - new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT, hiveConfig)); + getDefaultHivePageSourceFactories(HDFS_ENVIRONMENT, hiveConfig)); return provider.createPageSource( transaction, @@ -209,7 +204,6 @@ private static TestingConnectorSession getSession(HiveConfig config) return TestingConnectorSession.builder() .setPropertyMetadata(new HiveSessionProperties( config, - new HiveFormatsConfig(), new OrcReaderConfig(), new OrcWriterConfig(), new ParquetReaderConfig(), diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestOrcPageSourceMemoryTracking.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestOrcPageSourceMemoryTracking.java index d3278c0345af0..386f7f19dbb4e 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestOrcPageSourceMemoryTracking.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/TestOrcPageSourceMemoryTracking.java @@ -325,7 +325,6 @@ public void testMaxReadBytes(int rowCount) int maxReadBytes = 1_000; HiveSessionProperties hiveSessionProperties = new HiveSessionProperties( new HiveConfig(), - new HiveFormatsConfig(), new OrcReaderConfig() .setMaxBlockSize(DataSize.ofBytes(maxReadBytes)), new OrcWriterConfig(), @@ -568,8 +567,6 @@ public ConnectorPageSource newPageSource(FileFormatDataSourceStats stats, Connec ConnectorPageSource connectorPageSource = HivePageSourceProvider.createHivePageSource( ImmutableSet.of(orcPageSourceFactory), - ImmutableSet.of(), - newEmptyConfiguration(), session, Location.of(fileSplit.getPath().toString()), OptionalInt.empty(), @@ -578,11 +575,9 @@ public ConnectorPageSource newPageSource(FileFormatDataSourceStats stats, Connec fileSplit.getLength(), schema, TupleDomain.all(), - columns, TESTING_TYPE_MANAGER, Optional.empty(), Optional.empty(), - false, Optional.empty(), false, NO_ACID_TRANSACTION, diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/AvroSchemaGenerationTests.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/avro/TestAvroSchemaGeneration.java similarity index 94% rename from plugin/trino-hive/src/test/java/io/trino/plugin/hive/AvroSchemaGenerationTests.java rename to plugin/trino-hive/src/test/java/io/trino/plugin/hive/avro/TestAvroSchemaGeneration.java index 554772fa76016..484709c64ccbf 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/AvroSchemaGenerationTests.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/avro/TestAvroSchemaGeneration.java @@ -11,12 +11,11 @@ * See the 
License for the specific language governing permissions and * limitations under the License. */ -package io.trino.plugin.hive; +package io.trino.plugin.hive.avro; import io.trino.filesystem.local.LocalFileSystem; import io.trino.hadoop.ConfigurationInstantiator; -import io.trino.plugin.hive.avro.AvroHiveFileUtils; -import io.trino.plugin.hive.avro.TrinoAvroSerDe; +import io.trino.plugin.hive.HiveType; import io.trino.plugin.hive.type.TypeInfo; import io.trino.spi.type.RowType; import io.trino.spi.type.VarcharType; @@ -34,7 +33,7 @@ import static io.trino.plugin.hive.util.SerdeConstants.LIST_COLUMN_TYPES; import static org.assertj.core.api.Assertions.assertThat; -public class AvroSchemaGenerationTests +public class TestAvroSchemaGeneration { @Test public void testOldVsNewSchemaGeneration() diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/avro/TrinoAvroSerDe.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/avro/TrinoAvroSerDe.java similarity index 100% rename from plugin/trino-hive/src/main/java/io/trino/plugin/hive/avro/TrinoAvroSerDe.java rename to plugin/trino-hive/src/test/java/io/trino/plugin/hive/avro/TrinoAvroSerDe.java diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/benchmark/AbstractFileFormat.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/benchmark/AbstractFileFormat.java index d6afdddd1a542..77143b82792ae 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/benchmark/AbstractFileFormat.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/benchmark/AbstractFileFormat.java @@ -18,13 +18,10 @@ import com.google.common.collect.ImmutableSet; import io.trino.filesystem.Location; import io.trino.hdfs.HdfsEnvironment; -import io.trino.plugin.hive.GenericHiveRecordCursorProvider; import io.trino.plugin.hive.HiveColumnHandle; import io.trino.plugin.hive.HiveConfig; import io.trino.plugin.hive.HivePageSourceFactory; import io.trino.plugin.hive.HivePageSourceProvider; -import io.trino.plugin.hive.HiveRecordCursorProvider; -import io.trino.plugin.hive.HiveRecordCursorProvider.ReaderRecordCursorWithProjections; import io.trino.plugin.hive.HiveSplit; import io.trino.plugin.hive.HiveStorageFormat; import io.trino.plugin.hive.HiveTableHandle; @@ -37,7 +34,6 @@ import io.trino.spi.connector.ConnectorPageSource; import io.trino.spi.connector.ConnectorSession; import io.trino.spi.connector.DynamicFilter; -import io.trino.spi.connector.RecordPageSource; import io.trino.spi.predicate.TupleDomain; import io.trino.spi.type.Type; import io.trino.sql.planner.TestingConnectorTransactionHandle; @@ -82,18 +78,6 @@ public boolean supportsDate() return true; } - @Override - public Optional getHivePageSourceFactory(HdfsEnvironment environment) - { - return Optional.empty(); - } - - @Override - public Optional getHiveRecordCursorProvider(HdfsEnvironment environment) - { - return Optional.empty(); - } - @Override public ConnectorPageSource createFileFormatReader( ConnectorSession session, @@ -102,16 +86,7 @@ public ConnectorPageSource createFileFormatReader( List columnNames, List columnTypes) { - Optional pageSourceFactory = getHivePageSourceFactory(hdfsEnvironment); - Optional recordCursorProvider = getHiveRecordCursorProvider(hdfsEnvironment); - - checkArgument(pageSourceFactory.isPresent() ^ recordCursorProvider.isPresent()); - - if (pageSourceFactory.isPresent()) { - return createPageSource(pageSourceFactory.get(), session, targetFile, columnNames, columnTypes, getFormat()); - } - - return 
createPageSource(recordCursorProvider.get(), session, targetFile, columnNames, columnTypes, getFormat()); + return createPageSource(getHivePageSourceFactory(hdfsEnvironment), session, targetFile, columnNames, columnTypes, getFormat()); } @Override @@ -125,11 +100,8 @@ public ConnectorPageSource createGenericReader( { HivePageSourceProvider factory = new HivePageSourceProvider( TESTING_TYPE_MANAGER, - hdfsEnvironment, new HiveConfig(), - getHivePageSourceFactory(hdfsEnvironment).map(ImmutableSet::of).orElse(ImmutableSet.of()), - getHiveRecordCursorProvider(hdfsEnvironment).map(ImmutableSet::of).orElse(ImmutableSet.of()), - new GenericHiveRecordCursorProvider(hdfsEnvironment, new HiveConfig())); + ImmutableSet.of(getHivePageSourceFactory(hdfsEnvironment))); Properties schema = createSchema(getFormat(), schemaColumnNames, schemaColumnTypes); @@ -149,7 +121,6 @@ public ConnectorPageSource createGenericReader( TableToPartitionMapping.empty(), Optional.empty(), Optional.empty(), - false, Optional.empty(), SplitWeight.standard()); @@ -167,36 +138,6 @@ public boolean supports(TestData testData) return true; } - static ConnectorPageSource createPageSource( - HiveRecordCursorProvider cursorProvider, - ConnectorSession session, - File targetFile, - List columnNames, - List columnTypes, - HiveStorageFormat format) - { - checkArgument(columnNames.size() == columnTypes.size(), "columnNames and columnTypes should have the same size"); - - List readColumns = getBaseColumns(columnNames, columnTypes); - - Optional recordCursorWithProjections = cursorProvider.createRecordCursor( - conf, - session, - Location.of(targetFile.getAbsolutePath()), - 0, - targetFile.length(), - targetFile.length(), - createSchema(format, columnNames, columnTypes), - readColumns, - TupleDomain.all(), - TESTING_TYPE_MANAGER, - false); - - checkState(recordCursorWithProjections.isPresent(), "readerPageSourceWithProjections is not present"); - checkState(recordCursorWithProjections.get().getProjectedReaderColumns().isEmpty(), "projection should not be required"); - return new RecordPageSource(columnTypes, recordCursorWithProjections.get().getRecordCursor()); - } - static ConnectorPageSource createPageSource( HivePageSourceFactory pageSourceFactory, ConnectorSession session, diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/benchmark/FileFormat.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/benchmark/FileFormat.java index 81e4a9dcde938..4f4f301789bd8 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/benchmark/FileFormat.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/benchmark/FileFormat.java @@ -16,7 +16,6 @@ import io.trino.hdfs.HdfsEnvironment; import io.trino.plugin.hive.HiveCompressionCodec; import io.trino.plugin.hive.HivePageSourceFactory; -import io.trino.plugin.hive.HiveRecordCursorProvider; import io.trino.plugin.hive.HiveStorageFormat; import io.trino.spi.connector.ColumnHandle; import io.trino.spi.connector.ConnectorPageSource; @@ -26,7 +25,6 @@ import java.io.File; import java.io.IOException; import java.util.List; -import java.util.Optional; public interface FileFormat { @@ -42,9 +40,7 @@ FormatWriter createFileFormatWriter( boolean supportsDate(); - Optional getHivePageSourceFactory(HdfsEnvironment environment); - - Optional getHiveRecordCursorProvider(HdfsEnvironment environment); + HivePageSourceFactory getHivePageSourceFactory(HdfsEnvironment environment); ConnectorPageSource createFileFormatReader( ConnectorSession session, diff --git 
a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/benchmark/StandardFileFormats.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/benchmark/StandardFileFormats.java index 2eb6b6c6f739b..6ab34775965dd 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/benchmark/StandardFileFormats.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/benchmark/StandardFileFormats.java @@ -70,9 +70,9 @@ public HiveStorageFormat getFormat() } @Override - public Optional getHivePageSourceFactory(HdfsEnvironment hdfsEnvironment) + public HivePageSourceFactory getHivePageSourceFactory(HdfsEnvironment hdfsEnvironment) { - return Optional.of(new RcFilePageSourceFactory(HDFS_FILE_SYSTEM_FACTORY, new HiveConfig().setRcfileTimeZone("UTC"))); + return new RcFilePageSourceFactory(HDFS_FILE_SYSTEM_FACTORY, new HiveConfig().setRcfileTimeZone("UTC")); } @Override @@ -101,9 +101,9 @@ public HiveStorageFormat getFormat() } @Override - public Optional getHivePageSourceFactory(HdfsEnvironment hdfsEnvironment) + public HivePageSourceFactory getHivePageSourceFactory(HdfsEnvironment hdfsEnvironment) { - return Optional.of(new RcFilePageSourceFactory(HDFS_FILE_SYSTEM_FACTORY, new HiveConfig().setRcfileTimeZone("UTC"))); + return new RcFilePageSourceFactory(HDFS_FILE_SYSTEM_FACTORY, new HiveConfig().setRcfileTimeZone("UTC")); } @Override @@ -132,9 +132,9 @@ public HiveStorageFormat getFormat() } @Override - public Optional getHivePageSourceFactory(HdfsEnvironment hdfsEnvironment) + public HivePageSourceFactory getHivePageSourceFactory(HdfsEnvironment hdfsEnvironment) { - return Optional.of(new OrcPageSourceFactory(new OrcReaderOptions(), HDFS_FILE_SYSTEM_FACTORY, new FileFormatDataSourceStats(), UTC)); + return new OrcPageSourceFactory(new OrcReaderOptions(), HDFS_FILE_SYSTEM_FACTORY, new FileFormatDataSourceStats(), UTC); } @Override @@ -163,13 +163,13 @@ public HiveStorageFormat getFormat() } @Override - public Optional getHivePageSourceFactory(HdfsEnvironment hdfsEnvironment) + public HivePageSourceFactory getHivePageSourceFactory(HdfsEnvironment hdfsEnvironment) { - return Optional.of(new ParquetPageSourceFactory( + return new ParquetPageSourceFactory( new HdfsFileSystemFactory(hdfsEnvironment, HDFS_FILE_SYSTEM_STATS), new FileFormatDataSourceStats(), new ParquetReaderConfig(), - new HiveConfig())); + new HiveConfig()); } @Override diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/orc/TestOrcPredicates.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/orc/TestOrcPredicates.java index 383bf51e08c79..084223f391932 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/orc/TestOrcPredicates.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/orc/TestOrcPredicates.java @@ -48,7 +48,6 @@ import static com.google.common.base.Preconditions.checkState; import static com.google.common.collect.ImmutableList.toImmutableList; -import static io.trino.hadoop.ConfigurationInstantiator.newEmptyConfiguration; import static io.trino.plugin.hive.HivePageSourceProvider.ColumnMapping.buildColumnMappings; import static io.trino.plugin.hive.HiveStorageFormat.ORC; import static io.trino.plugin.hive.HiveTestUtils.HDFS_FILE_SYSTEM_FACTORY; @@ -219,8 +218,6 @@ private ConnectorPageSource createPageSource( Optional pageSource = HivePageSourceProvider.createHivePageSource( ImmutableSet.of(readerFactory), - ImmutableSet.of(), - newEmptyConfiguration(), session, Location.of(split.getPath().toString()), OptionalInt.empty(), @@ -229,11 +226,9 @@ private 
ConnectorPageSource createPageSource( split.getLength(), splitProperties, predicate, - columnHandles, TESTING_TYPE_MANAGER, Optional.empty(), Optional.empty(), - false, Optional.empty(), false, NO_ACID_TRANSACTION, diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/parquet/ParquetTester.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/parquet/ParquetTester.java index c3883e325b101..b473aae303320 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/parquet/ParquetTester.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/parquet/ParquetTester.java @@ -29,7 +29,6 @@ import io.trino.parquet.writer.ParquetWriterOptions; import io.trino.plugin.hive.FileFormatDataSourceStats; import io.trino.plugin.hive.HiveConfig; -import io.trino.plugin.hive.HiveFormatsConfig; import io.trino.plugin.hive.HiveSessionProperties; import io.trino.plugin.hive.HiveStorageFormat; import io.trino.plugin.hive.benchmark.FileFormat; @@ -437,7 +436,6 @@ void assertMaxReadBytes( new HiveConfig() .setHiveStorageFormat(HiveStorageFormat.PARQUET) .setUseParquetColumnNames(false), - new HiveFormatsConfig(), new OrcReaderConfig(), new OrcWriterConfig(), new ParquetReaderConfig() diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/parquet/TestOnlyNulls.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/parquet/TestOnlyNulls.java index c432a3974f8be..033b7eda2ef60 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/parquet/TestOnlyNulls.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/parquet/TestOnlyNulls.java @@ -81,7 +81,7 @@ public void testOnlyNulls() private static ConnectorPageSource createPageSource(File parquetFile, HiveColumnHandle column, TupleDomain domain) { - HivePageSourceFactory pageSourceFactory = StandardFileFormats.TRINO_PARQUET.getHivePageSourceFactory(HDFS_ENVIRONMENT).orElseThrow(); + HivePageSourceFactory pageSourceFactory = StandardFileFormats.TRINO_PARQUET.getHivePageSourceFactory(HDFS_ENVIRONMENT); Properties schema = new Properties(); schema.setProperty(SERIALIZATION_LIB, HiveStorageFormat.PARQUET.getSerde()); diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/parquet/TestTimestampMicros.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/parquet/TestTimestampMicros.java index 9f1a9584f3305..40cc8ee68bb69 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/parquet/TestTimestampMicros.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/parquet/TestTimestampMicros.java @@ -99,7 +99,7 @@ private ConnectorPageSource createPageSource(ConnectorSession session, File parq // TODO after https://github.com/trinodb/trino/pull/5283, replace the method with // return FileFormat.PRESTO_PARQUET.createFileFormatReader(session, HDFS_ENVIRONMENT, parquetFile, columnNames, columnTypes); - HivePageSourceFactory pageSourceFactory = StandardFileFormats.TRINO_PARQUET.getHivePageSourceFactory(HDFS_ENVIRONMENT).orElseThrow(); + HivePageSourceFactory pageSourceFactory = StandardFileFormats.TRINO_PARQUET.getHivePageSourceFactory(HDFS_ENVIRONMENT); Properties schema = new Properties(); schema.setProperty(SERIALIZATION_LIB, HiveStorageFormat.PARQUET.getSerde()); diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/parquet/write/TestingMapredParquetOutputFormat.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/parquet/write/TestingMapredParquetOutputFormat.java index 5e4bcbd95f48d..c49f03c81035c 100644 --- 
a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/parquet/write/TestingMapredParquetOutputFormat.java
+++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/parquet/write/TestingMapredParquetOutputFormat.java
@@ -20,15 +20,16 @@
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.util.Progressable;
+import org.apache.parquet.hadoop.DisabledMemoryManager;
 import org.apache.parquet.hadoop.ParquetOutputFormat;
 import org.apache.parquet.schema.MessageType;
 import org.joda.time.DateTimeZone;
 
 import java.io.IOException;
+import java.lang.reflect.Field;
 import java.util.Optional;
 import java.util.Properties;
 
-import static io.trino.plugin.hive.parquet.ParquetRecordWriter.replaceHadoopParquetMemoryManager;
 import static java.util.Objects.requireNonNull;
 
 /*
@@ -42,8 +43,6 @@ public class TestingMapredParquetOutputFormat
         extends MapredParquetOutputFormat
 {
     static {
-        // The tests using this class don't use io.trino.plugin.hive.parquet.ParquetRecordWriter for writing parquet files with old writer.
-        // Therefore, we need to replace the hadoop parquet memory manager here explicitly.
         replaceHadoopParquetMemoryManager();
     }
 
@@ -71,4 +70,16 @@ public FileSinkOperator.RecordWriter getHiveRecordWriter(
         }
         return super.getHiveRecordWriter(jobConf, finalOutPath, valueClass, isCompressed, tableProperties, progress);
     }
+
+    private static void replaceHadoopParquetMemoryManager()
+    {
+        try {
+            Field memoryManager = ParquetOutputFormat.class.getDeclaredField("memoryManager");
+            memoryManager.setAccessible(true);
+            memoryManager.set(null, new DisabledMemoryManager());
+        }
+        catch (ReflectiveOperationException e) {
+            throw new AssertionError(e);
+        }
+    }
 }
diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/s3/TestMinioS3SelectQueries.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/s3/TestMinioS3SelectQueries.java
deleted file mode 100644
index 1048079596444..0000000000000
--- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/s3/TestMinioS3SelectQueries.java
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ -package io.trino.plugin.hive.s3; - -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; -import io.airlift.units.DataSize; -import io.trino.Session; -import io.trino.plugin.hive.containers.HiveHadoop; -import io.trino.plugin.hive.containers.HiveMinioDataLake; -import io.trino.testing.AbstractTestQueryFramework; -import io.trino.testing.QueryRunner; -import io.trino.testing.sql.TestTable; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -import java.util.List; - -import static io.airlift.units.DataSize.Unit.MEGABYTE; -import static io.trino.testing.TestingNames.randomNameSuffix; -import static java.lang.String.format; - -public class TestMinioS3SelectQueries - extends AbstractTestQueryFramework -{ - private static final String HIVE_TEST_SCHEMA = "hive_datalake"; - private static final DataSize HIVE_S3_STREAMING_PART_SIZE = DataSize.of(5, MEGABYTE); - - private String bucketName; - - @Override - protected QueryRunner createQueryRunner() - throws Exception - { - this.bucketName = "test-hive-insert-overwrite-" + randomNameSuffix(); - HiveMinioDataLake hiveMinioDataLake = closeAfterClass(new HiveMinioDataLake(bucketName, HiveHadoop.HIVE3_IMAGE)); - hiveMinioDataLake.start(); - return S3HiveQueryRunner.builder(hiveMinioDataLake) - .setHiveProperties( - ImmutableMap.builder() - .put("hive.non-managed-table-writes-enabled", "true") - .put("hive.metastore-cache-ttl", "1d") - .put("hive.metastore-refresh-interval", "1d") - .put("hive.s3.streaming.part-size", HIVE_S3_STREAMING_PART_SIZE.toString()) - .buildOrThrow()) - .build(); - } - - @BeforeClass - public void setUp() - { - computeActual(format( - "CREATE SCHEMA hive.%1$s WITH (location='s3a://%2$s/%1$s')", - HIVE_TEST_SCHEMA, - bucketName)); - } - - @Test - public void testTextfileQueries() - { - // Demonstrate correctness issues which have resulted in pushdown for TEXTFILE - // using CSV support in S3 Select being put behind a separate "experimental" flag. 
- // TODO: https://github.com/trinodb/trino/issues/17775 - List values = ImmutableList.of( - "1, true, 11", - "2, true, 22", - "3, NULL, NULL", - "4, false, 44"); - Session withS3SelectPushdown = Session.builder(getSession()) - .setCatalogSessionProperty("hive", "s3_select_pushdown_enabled", "true") - .setCatalogSessionProperty("hive", "json_native_reader_enabled", "false") - .setCatalogSessionProperty("hive", "text_file_native_reader_enabled", "false") - .build(); - Session withoutS3SelectPushdown = Session.builder(getSession()) - .setCatalogSessionProperty("hive", "json_native_reader_enabled", "false") - .setCatalogSessionProperty("hive", "text_file_native_reader_enabled", "false") - .build(); - try (TestTable table = new TestTable( - getQueryRunner()::execute, - "hive.%s.test_textfile_queries".formatted(HIVE_TEST_SCHEMA), - "(id INT, bool_t BOOLEAN, int_t INT) WITH (format = 'TEXTFILE')", - values)) { - assertQuery(withS3SelectPushdown, "SELECT id FROM " + table.getName() + " WHERE int_t IS NULL", "VALUES 3"); - assertQuery(withS3SelectPushdown, "SELECT id FROM " + table.getName() + " WHERE bool_t = true", "VALUES 1, 2"); - } - - List specialCharacterValues = ImmutableList.of( - "1, 'a,comma'", - "2, 'a|pipe'", - "3, 'an''escaped quote'", - "4, 'a~null encoding'"); - try (TestTable table = new TestTable( - getQueryRunner()::execute, - "hive.%s.test_s3_select_pushdown_special_characters".formatted(HIVE_TEST_SCHEMA), - "(id INT, string_t VARCHAR) WITH (format = 'TEXTFILE', textfile_field_separator=',', textfile_field_separator_escape='|', null_format='~')", - specialCharacterValues)) { - String selectWithComma = "SELECT id FROM " + table.getName() + " WHERE string_t = 'a,comma'"; - assertQuery(withoutS3SelectPushdown, selectWithComma, "VALUES 1"); - assertQuery(withS3SelectPushdown, selectWithComma, "VALUES 1"); - - String selectWithPipe = "SELECT id FROM " + table.getName() + " WHERE string_t = 'a|pipe'"; - assertQuery(withoutS3SelectPushdown, selectWithPipe, "VALUES 2"); - assertQuery(withS3SelectPushdown, selectWithPipe, "VALUES 2"); - - String selectWithQuote = "SELECT id FROM " + table.getName() + " WHERE string_t = 'an''escaped quote'"; - assertQuery(withoutS3SelectPushdown, selectWithQuote, "VALUES 3"); - assertQuery(withS3SelectPushdown, selectWithQuote, "VALUES 3"); - - String selectWithNullFormatEncoding = "SELECT id FROM " + table.getName() + " WHERE string_t = 'a~null encoding'"; - assertQuery(withoutS3SelectPushdown, selectWithNullFormatEncoding, "VALUES 4"); - assertQuery(withS3SelectPushdown, selectWithNullFormatEncoding, "VALUES 4"); - } - } -} diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/s3/TestS3SelectQueries.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/s3/TestS3SelectQueries.java deleted file mode 100644 index 77f542517b3d0..0000000000000 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/s3/TestS3SelectQueries.java +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package io.trino.plugin.hive.s3; - -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; -import io.trino.Session; -import io.trino.plugin.hive.HiveQueryRunner; -import io.trino.plugin.hive.NodeVersion; -import io.trino.plugin.hive.metastore.HiveMetastoreConfig; -import io.trino.plugin.hive.metastore.file.FileHiveMetastore; -import io.trino.plugin.hive.metastore.file.FileHiveMetastoreConfig; -import io.trino.testing.AbstractTestQueryFramework; -import io.trino.testing.MaterializedResult; -import io.trino.testing.QueryRunner; -import io.trino.testing.sql.TestTable; -import org.intellij.lang.annotations.Language; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Parameters; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.List; - -import static io.trino.plugin.hive.HiveTestUtils.HDFS_FILE_SYSTEM_FACTORY; -import static io.trino.testing.TestingNames.randomNameSuffix; -import static java.util.Objects.requireNonNull; -import static org.assertj.core.api.Assertions.assertThat; -import static org.testng.Assert.assertEquals; - -// The test requires AWS credentials be provided via one of the ways used by the DefaultAWSCredentialsProviderChain. -public class TestS3SelectQueries - extends AbstractTestQueryFramework -{ - private final String bucket; - private final String bucketEndpoint; - - @Parameters({"s3.bucket", "s3.bucket-endpoint"}) - public TestS3SelectQueries(String bucket, String bucketEndpoint) - { - this.bucket = requireNonNull(bucket, "bucket is null"); - this.bucketEndpoint = requireNonNull(bucketEndpoint, "bucketEndpoint is null"); - } - - @Override - protected QueryRunner createQueryRunner() - throws Exception - { - ImmutableMap.Builder hiveProperties = ImmutableMap.builder(); - hiveProperties.put("hive.s3.endpoint", bucketEndpoint); - hiveProperties.put("hive.non-managed-table-writes-enabled", "true"); - hiveProperties.put("hive.s3select-pushdown.experimental-textfile-pushdown-enabled", "true"); - return HiveQueryRunner.builder() - .setHiveProperties(hiveProperties.buildOrThrow()) - .setInitialTables(ImmutableList.of()) - .setMetastore(queryRunner -> { - File baseDir = queryRunner.getCoordinator().getBaseDataDir().resolve("hive_data").toFile(); - return new FileHiveMetastore( - new NodeVersion("testversion"), - HDFS_FILE_SYSTEM_FACTORY, - new HiveMetastoreConfig().isHideDeltaLakeTables(), - new FileHiveMetastoreConfig() - .setCatalogDirectory(baseDir.toURI().toString()) - .setMetastoreUser("test") - .setDisableLocationChecks(true)); - }) - .build(); - } - - @Test(dataProvider = "s3SelectFileFormats") - public void testS3SelectPushdown(String tableProperties) - { - Session usingAppendInserts = Session.builder(getSession()) - .setCatalogSessionProperty("hive", "insert_existing_partitions_behavior", "APPEND") - .build(); - List values = ImmutableList.of( - "1, true, 11, 111, 1111, 11111, 'one', DATE '2020-01-01'", - "2, true, 22, 222, 2222, 22222, 'two', DATE '2020-02-02'", - "3, NULL, NULL, NULL, NULL, NULL, NULL, NULL", - "4, false, 44, 444, 4444, 44444, '', DATE '2020-04-04'"); - try (TestTable table = new TestTable( - sql -> getQueryRunner().execute(usingAppendInserts, sql), - "hive.%s.test_s3_select_pushdown".formatted(HiveQueryRunner.TPCH_SCHEMA), - "(id INT, bool_t BOOLEAN, tiny_t TINYINT, small_t SMALLINT, int_t INT, big_t BIGINT, string_t VARCHAR, date_t DATE) " + - "WITH (external_location = 's3://" + bucket + "/test_s3_select_pushdown/test_table_" + randomNameSuffix() + "', 
" + tableProperties + ")", values)) { - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE bool_t = true", "VALUES 1, 2"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE bool_t = false", "VALUES 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE bool_t IS NULL", "VALUES 3"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE bool_t IS NOT NULL", "VALUES 1, 2, 4"); - - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t = 22", "VALUES 2"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t != 22", "VALUES 1, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t > 22", "VALUES 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t >= 22", "VALUES 2, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t = 22 OR tiny_t = 44", "VALUES 2, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t IS NULL OR tiny_t >= 22", "VALUES 2, 3, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t IS NULL", "VALUES 3"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t IS NOT NULL", "VALUES 1, 2, 4"); - - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t = 222", "VALUES 2"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t != 222", "VALUES 1, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t > 222", "VALUES 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t >= 222", "VALUES 2, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t = 222 OR small_t = 444", "VALUES 2, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t IS NULL OR small_t >= 222", "VALUES 2, 3, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t IS NULL", "VALUES 3"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t IS NOT NULL", "VALUES 1, 2, 4"); - - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t = 2222", "VALUES 2"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t != 2222", "VALUES 1, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t > 2222", "VALUES 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t >= 2222", "VALUES 2, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t = 2222 OR int_t = 4444", "VALUES 2, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t IS NULL OR int_t >= 2222", "VALUES 2, 3, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t IS NULL", "VALUES 3"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t IS NOT NULL", "VALUES 1, 2, 4"); - - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t = 22222", "VALUES 2"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t != 22222", "VALUES 1, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t > 22222", "VALUES 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t >= 22222", "VALUES 2, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t = 22222 OR big_t = 44444", "VALUES 2, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t IS NULL OR 
big_t >= 22222", "VALUES 2, 3, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t IS NULL", "VALUES 3"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t IS NOT NULL", "VALUES 1, 2, 4"); - - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t = 'two'", "VALUES 2"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t != 'two'", "VALUES 1, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t < 'two'", "VALUES 1, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t <= 'two'", "VALUES 1, 2, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t = 'two' OR string_t = ''", "VALUES 2, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t IS NULL OR string_t >= 'two'", "VALUES 2, 3"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t IS NULL", "VALUES 3"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t IS NOT NULL", "VALUES 1, 2, 4"); - - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t = DATE '2020-02-02'", "VALUES 2"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t != DATE '2020-02-02'", "VALUES 1, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t > DATE '2020-02-02'", "VALUES 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t <= DATE '2020-02-02'", "VALUES 1, 2"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t = DATE '2020-02-02' OR date_t = DATE '2020-04-04'", "VALUES 2, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t IS NULL OR date_t >= DATE '2020-02-02'", "VALUES 2, 3, 4"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t IS NULL", "VALUES 3"); - assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t IS NOT NULL", "VALUES 1, 2, 4"); - } - } - - private void assertS3SelectQuery(@Language("SQL") String query, @Language("SQL") String expectedValues) - { - Session withS3SelectPushdown = Session.builder(getSession()) - .setCatalogSessionProperty("hive", "s3_select_pushdown_enabled", "true") - .setCatalogSessionProperty("hive", "json_native_reader_enabled", "false") - .setCatalogSessionProperty("hive", "text_file_native_reader_enabled", "false") - .build(); - - MaterializedResult expectedResult = computeActual(expectedValues); - assertQueryStats( - withS3SelectPushdown, - query, - statsWithPushdown -> { - long inputPositionsWithPushdown = statsWithPushdown.getPhysicalInputPositions(); - assertQueryStats( - getSession(), - query, - statsWithoutPushdown -> assertThat(statsWithoutPushdown.getPhysicalInputPositions()).isGreaterThan(inputPositionsWithPushdown), - results -> assertEquals(results.getOnlyColumnAsSet(), expectedResult.getOnlyColumnAsSet())); - }, - results -> assertEquals(results.getOnlyColumnAsSet(), expectedResult.getOnlyColumnAsSet())); - } - - @DataProvider - public static Object[][] s3SelectFileFormats() - { - return new Object[][] { - {"format = 'JSON'"}, - {"format = 'TEXTFILE', textfile_field_separator=',', textfile_field_separator_escape='|', null_format='~'"} - }; - } -} diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/s3select/TestIonSqlQueryBuilder.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/s3select/TestIonSqlQueryBuilder.java deleted file mode 100644 index 
bb29743101532..0000000000000 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/s3select/TestIonSqlQueryBuilder.java +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.trino.plugin.hive.s3select; - -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; -import io.airlift.slice.Slices; -import io.trino.plugin.hive.HiveColumnHandle; -import io.trino.plugin.hive.HiveType; -import io.trino.spi.predicate.Domain; -import io.trino.spi.predicate.Range; -import io.trino.spi.predicate.SortedRangeSet; -import io.trino.spi.predicate.TupleDomain; -import io.trino.spi.predicate.ValueSet; -import io.trino.spi.type.DecimalType; -import io.trino.spi.type.TypeManager; -import io.trino.util.DateTimeUtils; -import org.testng.annotations.Test; - -import java.util.List; -import java.util.Optional; - -import static io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; -import static io.trino.plugin.hive.HiveColumnHandle.createBaseColumn; -import static io.trino.plugin.hive.HiveTestUtils.longDecimal; -import static io.trino.plugin.hive.HiveTestUtils.shortDecimal; -import static io.trino.plugin.hive.HiveType.HIVE_DATE; -import static io.trino.plugin.hive.HiveType.HIVE_DOUBLE; -import static io.trino.plugin.hive.HiveType.HIVE_INT; -import static io.trino.plugin.hive.HiveType.HIVE_STRING; -import static io.trino.plugin.hive.HiveType.HIVE_TIMESTAMP; -import static io.trino.spi.predicate.TupleDomain.withColumnDomains; -import static io.trino.spi.predicate.ValueSet.ofRanges; -import static io.trino.spi.type.BigintType.BIGINT; -import static io.trino.spi.type.DateType.DATE; -import static io.trino.spi.type.DoubleType.DOUBLE; -import static io.trino.spi.type.IntegerType.INTEGER; -import static io.trino.spi.type.TimestampType.TIMESTAMP_MILLIS; -import static io.trino.spi.type.VarcharType.VARCHAR; -import static io.trino.type.InternalTypeManager.TESTING_TYPE_MANAGER; -import static org.testng.Assert.assertEquals; - -public class TestIonSqlQueryBuilder -{ - @Test - public void testBuildSQL() - { - List columns = ImmutableList.of( - createBaseColumn("n_nationkey", 0, HIVE_INT, INTEGER, REGULAR, Optional.empty()), - createBaseColumn("n_name", 1, HIVE_STRING, VARCHAR, REGULAR, Optional.empty()), - createBaseColumn("n_regionkey", 2, HIVE_INT, INTEGER, REGULAR, Optional.empty())); - - // CSV - IonSqlQueryBuilder queryBuilder = new IonSqlQueryBuilder(TESTING_TYPE_MANAGER, S3SelectDataType.CSV, Optional.empty()); - assertEquals(queryBuilder.buildSql(columns, TupleDomain.all()), - "SELECT s._1, s._2, s._3 FROM S3Object s"); - - TupleDomain tupleDomain = withColumnDomains(ImmutableMap.of( - columns.get(2), Domain.create(SortedRangeSet.copyOf(BIGINT, ImmutableList.of(Range.equal(BIGINT, 3L))), false))); - assertEquals(queryBuilder.buildSql(columns, tupleDomain), - "SELECT s._1, s._2, s._3 FROM S3Object s WHERE (s._3 != '' AND CAST(s._3 AS INT) = 3)"); - - // JSON - queryBuilder = new IonSqlQueryBuilder(TESTING_TYPE_MANAGER, 
S3SelectDataType.JSON, Optional.empty()); - assertEquals(queryBuilder.buildSql(columns, TupleDomain.all()), - "SELECT s.n_nationkey, s.n_name, s.n_regionkey FROM S3Object s"); - assertEquals(queryBuilder.buildSql(columns, tupleDomain), - "SELECT s.n_nationkey, s.n_name, s.n_regionkey FROM S3Object s WHERE (s.n_regionkey IS NOT NULL AND CAST(s.n_regionkey AS INT) = 3)"); - } - - @Test - public void testEmptyColumns() - { - // CSV - IonSqlQueryBuilder queryBuilder = new IonSqlQueryBuilder(TESTING_TYPE_MANAGER, S3SelectDataType.CSV, Optional.empty()); - assertEquals(queryBuilder.buildSql(ImmutableList.of(), TupleDomain.all()), "SELECT ' ' FROM S3Object s"); - - // JSON - queryBuilder = new IonSqlQueryBuilder(TESTING_TYPE_MANAGER, S3SelectDataType.JSON, Optional.empty()); - assertEquals(queryBuilder.buildSql(ImmutableList.of(), TupleDomain.all()), "SELECT ' ' FROM S3Object s"); - } - - @Test - public void testDecimalColumns() - { - TypeManager typeManager = TESTING_TYPE_MANAGER; - List columns = ImmutableList.of( - createBaseColumn("quantity", 0, HiveType.valueOf("decimal(20,0)"), DecimalType.createDecimalType(), REGULAR, Optional.empty()), - createBaseColumn("extendedprice", 1, HiveType.valueOf("decimal(20,2)"), DecimalType.createDecimalType(), REGULAR, Optional.empty()), - createBaseColumn("discount", 2, HiveType.valueOf("decimal(10,2)"), DecimalType.createDecimalType(), REGULAR, Optional.empty())); - DecimalType decimalType = DecimalType.createDecimalType(10, 2); - TupleDomain tupleDomain = withColumnDomains( - ImmutableMap.of( - columns.get(0), Domain.create(ofRanges(Range.lessThan(DecimalType.createDecimalType(20, 0), longDecimal("50"))), false), - columns.get(1), Domain.create(ofRanges(Range.equal(HiveType.valueOf("decimal(20,2)").getType(typeManager), longDecimal("0.05"))), false), - columns.get(2), Domain.create(ofRanges(Range.range(decimalType, shortDecimal("0.0"), true, shortDecimal("0.02"), true)), false))); - - // CSV - IonSqlQueryBuilder queryBuilder = new IonSqlQueryBuilder(typeManager, S3SelectDataType.CSV, Optional.empty()); - assertEquals(queryBuilder.buildSql(columns, tupleDomain), "SELECT s._1, s._2, s._3 FROM S3Object s"); - - // JSON - queryBuilder = new IonSqlQueryBuilder(typeManager, S3SelectDataType.JSON, Optional.empty()); - assertEquals(queryBuilder.buildSql(columns, tupleDomain), "SELECT s.quantity, s.extendedprice, s.discount FROM S3Object s"); - } - - @Test - public void testDateColumn() - { - List columns = ImmutableList.of( - createBaseColumn("t1", 0, HIVE_TIMESTAMP, TIMESTAMP_MILLIS, REGULAR, Optional.empty()), - createBaseColumn("t2", 1, HIVE_DATE, DATE, REGULAR, Optional.empty())); - TupleDomain tupleDomain = withColumnDomains(ImmutableMap.of( - columns.get(1), Domain.create(SortedRangeSet.copyOf(DATE, ImmutableList.of(Range.equal(DATE, (long) DateTimeUtils.parseDate("2001-08-22")))), false))); - - // CSV - IonSqlQueryBuilder queryBuilder = new IonSqlQueryBuilder(TESTING_TYPE_MANAGER, S3SelectDataType.CSV, Optional.empty()); - assertEquals(queryBuilder.buildSql(columns, tupleDomain), "SELECT s._1, s._2 FROM S3Object s WHERE (s._2 != '' AND s._2 = '2001-08-22')"); - - // JSON - queryBuilder = new IonSqlQueryBuilder(TESTING_TYPE_MANAGER, S3SelectDataType.JSON, Optional.empty()); - assertEquals(queryBuilder.buildSql(columns, tupleDomain), "SELECT s.t1, s.t2 FROM S3Object s WHERE (s.t2 IS NOT NULL AND s.t2 = '2001-08-22')"); - } - - @Test - public void testNotPushDoublePredicates() - { - List columns = ImmutableList.of( - createBaseColumn("quantity", 0, HIVE_INT, 
INTEGER, REGULAR, Optional.empty()), - createBaseColumn("extendedprice", 1, HIVE_DOUBLE, DOUBLE, REGULAR, Optional.empty()), - createBaseColumn("discount", 2, HIVE_DOUBLE, DOUBLE, REGULAR, Optional.empty())); - TupleDomain tupleDomain = withColumnDomains( - ImmutableMap.of( - columns.get(0), Domain.create(ofRanges(Range.lessThan(BIGINT, 50L)), false), - columns.get(1), Domain.create(ofRanges(Range.equal(DOUBLE, 0.05)), false), - columns.get(2), Domain.create(ofRanges(Range.range(DOUBLE, 0.0, true, 0.02, true)), false))); - - // CSV - IonSqlQueryBuilder queryBuilder = new IonSqlQueryBuilder(TESTING_TYPE_MANAGER, S3SelectDataType.CSV, Optional.empty()); - assertEquals(queryBuilder.buildSql(columns, tupleDomain), "SELECT s._1, s._2, s._3 FROM S3Object s WHERE (s._1 != '' AND CAST(s._1 AS INT) < 50)"); - - // JSON - queryBuilder = new IonSqlQueryBuilder(TESTING_TYPE_MANAGER, S3SelectDataType.JSON, Optional.empty()); - assertEquals(queryBuilder.buildSql(columns, tupleDomain), "SELECT s.quantity, s.extendedprice, s.discount FROM S3Object s WHERE (s.quantity IS NOT NULL AND CAST(s.quantity AS INT) < 50)"); - } - - @Test - public void testStringEscaping() - { - List columns = ImmutableList.of( - createBaseColumn("string", 0, HIVE_STRING, VARCHAR, REGULAR, Optional.empty())); - TupleDomain tupleDomain = withColumnDomains(ImmutableMap.of( - columns.get(0), - Domain.create(ValueSet.of(VARCHAR, Slices.utf8Slice("value with a ' quote")), false))); - - // CSV - IonSqlQueryBuilder queryBuilder = new IonSqlQueryBuilder(TESTING_TYPE_MANAGER, S3SelectDataType.CSV, Optional.empty()); - assertEquals(queryBuilder.buildSql(columns, tupleDomain), "SELECT s._1 FROM S3Object s WHERE (s._1 != '' AND s._1 = 'value with a '' quote')"); - - // JSON - queryBuilder = new IonSqlQueryBuilder(TESTING_TYPE_MANAGER, S3SelectDataType.JSON, Optional.empty()); - assertEquals(queryBuilder.buildSql(columns, tupleDomain), "SELECT s.string FROM S3Object s WHERE (s.string IS NOT NULL AND s.string = 'value with a '' quote')"); - } -} diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/s3select/TestS3SelectPushdown.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/s3select/TestS3SelectPushdown.java deleted file mode 100644 index ceb44e81b4ad2..0000000000000 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/s3select/TestS3SelectPushdown.java +++ /dev/null @@ -1,297 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package io.trino.plugin.hive.s3select; - -import io.trino.plugin.hive.metastore.Column; -import io.trino.plugin.hive.metastore.Partition; -import io.trino.plugin.hive.metastore.Storage; -import io.trino.plugin.hive.metastore.StorageFormat; -import io.trino.plugin.hive.metastore.Table; -import io.trino.spi.connector.ConnectorSession; -import io.trino.testing.TestingConnectorSession; -import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; -import org.apache.hadoop.mapred.TextInputFormat; -import org.apache.hive.hcatalog.data.JsonSerDe; -import org.testng.annotations.AfterClass; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.OptionalLong; -import java.util.Properties; - -import static io.trino.hive.thrift.metastore.hive_metastoreConstants.FILE_INPUT_FORMAT; -import static io.trino.plugin.hive.HiveMetadata.SKIP_FOOTER_COUNT_KEY; -import static io.trino.plugin.hive.HiveMetadata.SKIP_HEADER_COUNT_KEY; -import static io.trino.plugin.hive.HiveStorageFormat.ORC; -import static io.trino.plugin.hive.HiveStorageFormat.TEXTFILE; -import static io.trino.plugin.hive.HiveType.HIVE_BINARY; -import static io.trino.plugin.hive.HiveType.HIVE_BOOLEAN; -import static io.trino.plugin.hive.metastore.MetastoreUtil.getHiveSchema; -import static io.trino.plugin.hive.metastore.StorageFormat.fromHiveStorageFormat; -import static io.trino.plugin.hive.s3select.S3SelectPushdown.isCompressionCodecSupported; -import static io.trino.plugin.hive.s3select.S3SelectPushdown.isSplittable; -import static io.trino.plugin.hive.s3select.S3SelectPushdown.shouldEnablePushdownForTable; -import static io.trino.spi.session.PropertyMetadata.booleanProperty; -import static java.util.Collections.emptyList; -import static java.util.Collections.emptyMap; -import static java.util.Collections.singletonList; -import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB; -import static org.testng.Assert.assertFalse; -import static org.testng.Assert.assertTrue; - -public class TestS3SelectPushdown -{ - private static final String S3_SELECT_PUSHDOWN_ENABLED = "s3_select_pushdown_enabled"; - - private ConnectorSession session; - private Table table; - private Partition partition; - private Storage storage; - private Column column; - private Properties schema; - - @BeforeClass - public void setUp() - { - session = TestingConnectorSession.builder() - .setPropertyMetadata(List.of(booleanProperty( - S3_SELECT_PUSHDOWN_ENABLED, - "S3 Select pushdown enabled", - true, - false))) - .setPropertyValues(Map.of(S3_SELECT_PUSHDOWN_ENABLED, true)) - .build(); - - column = new Column("column", HIVE_BOOLEAN, Optional.empty()); - - storage = Storage.builder() - .setStorageFormat(fromHiveStorageFormat(TEXTFILE)) - .setLocation("location") - .build(); - - partition = new Partition( - "db", - "table", - emptyList(), - storage, - singletonList(column), - emptyMap()); - - table = new Table( - "db", - "table", - Optional.of("owner"), - "type", - storage, - singletonList(column), - emptyList(), - emptyMap(), - Optional.empty(), - Optional.empty(), - OptionalLong.empty()); - - schema = getHiveSchema(partition, table); - } - - @Test - public void testIsCompressionCodecSupported() - { - assertTrue(isCompressionCodecSupported(schema, "s3://fakeBucket/fakeObject.gz")); - assertTrue(isCompressionCodecSupported(schema, "s3://fakeBucket/fakeObject")); - assertFalse(isCompressionCodecSupported(schema, 
"s3://fakeBucket/fakeObject.lz4")); - assertFalse(isCompressionCodecSupported(schema, "s3://fakeBucket/fakeObject.snappy")); - assertTrue(isCompressionCodecSupported(schema, "s3://fakeBucket/fakeObject.bz2")); - } - - @Test - public void testShouldEnableSelectPushdown() - { - assertTrue(shouldEnablePushdownForTable(session, table, "s3://fakeBucket/fakeObject", Optional.empty())); - assertTrue(shouldEnablePushdownForTable(session, table, "s3://fakeBucket/fakeObject", Optional.of(partition))); - } - - @Test - public void testShouldNotEnableSelectPushdownWhenDisabledOnSession() - { - ConnectorSession testSession = TestingConnectorSession.builder() - .setPropertyMetadata(List.of(booleanProperty( - S3_SELECT_PUSHDOWN_ENABLED, - "S3 Select pushdown enabled", - false, - false))) - .setPropertyValues(Map.of(S3_SELECT_PUSHDOWN_ENABLED, false)) - .build(); - assertFalse(shouldEnablePushdownForTable(testSession, table, "", Optional.empty())); - } - - @Test - public void testShouldNotEnableSelectPushdownWhenIsNotS3StoragePath() - { - assertFalse(shouldEnablePushdownForTable(session, table, null, Optional.empty())); - assertFalse(shouldEnablePushdownForTable(session, table, "", Optional.empty())); - assertFalse(shouldEnablePushdownForTable(session, table, "s3:/invalid", Optional.empty())); - assertFalse(shouldEnablePushdownForTable(session, table, "s3:/invalid", Optional.of(partition))); - } - - @Test - public void testShouldNotEnableSelectPushdownWhenIsNotSupportedSerde() - { - Storage newStorage = Storage.builder() - .setStorageFormat(fromHiveStorageFormat(ORC)) - .setLocation("location") - .build(); - Table newTable = new Table( - "db", - "table", - Optional.of("owner"), - "type", - newStorage, - singletonList(column), - emptyList(), - emptyMap(), - Optional.empty(), - Optional.empty(), - OptionalLong.empty()); - - assertFalse(shouldEnablePushdownForTable(session, newTable, "s3://fakeBucket/fakeObject", Optional.empty())); - - Partition newPartition = new Partition("db", - "table", - emptyList(), - newStorage, - singletonList(column), - emptyMap()); - assertFalse(shouldEnablePushdownForTable(session, newTable, "s3://fakeBucket/fakeObject", Optional.of(newPartition))); - } - - @Test - public void testShouldNotEnableSelectPushdownWhenIsNotSupportedInputFormat() - { - Storage newStorage = Storage.builder() - .setStorageFormat(StorageFormat.create(LazySimpleSerDe.class.getName(), "inputFormat", "outputFormat")) - .setLocation("location") - .build(); - Table newTable = new Table("db", - "table", - Optional.of("owner"), - "type", - newStorage, - singletonList(column), - emptyList(), - emptyMap(), - Optional.empty(), - Optional.empty(), - OptionalLong.empty()); - assertFalse(shouldEnablePushdownForTable(session, newTable, "s3://fakeBucket/fakeObject", Optional.empty())); - - Partition newPartition = new Partition("db", - "table", - emptyList(), - newStorage, - singletonList(column), - emptyMap()); - assertFalse(shouldEnablePushdownForTable(session, newTable, "s3://fakeBucket/fakeObject", Optional.of(newPartition))); - - newStorage = Storage.builder() - .setStorageFormat(StorageFormat.create(LazySimpleSerDe.class.getName(), TextInputFormat.class.getName(), "outputFormat")) - .setLocation("location") - .build(); - newTable = new Table("db", - "table", - Optional.of("owner"), - "type", - newStorage, - singletonList(column), - emptyList(), - Map.of(SKIP_HEADER_COUNT_KEY, "1"), - Optional.empty(), - Optional.empty(), - OptionalLong.empty()); - assertFalse(shouldEnablePushdownForTable(session, newTable, 
"s3://fakeBucket/fakeObject", Optional.empty())); - - newTable = new Table("db", - "table", - Optional.of("owner"), - "type", - newStorage, - singletonList(column), - emptyList(), - Map.of(SKIP_FOOTER_COUNT_KEY, "1"), - Optional.empty(), - Optional.empty(), - OptionalLong.empty()); - assertFalse(shouldEnablePushdownForTable(session, newTable, "s3://fakeBucket/fakeObject", Optional.empty())); - } - - @Test - public void testShouldNotEnableSelectPushdownWhenColumnTypesAreNotSupported() - { - Column newColumn = new Column("column", HIVE_BINARY, Optional.empty()); - Table newTable = new Table("db", - "table", - Optional.of("owner"), - "type", - storage, - singletonList(newColumn), - emptyList(), - emptyMap(), - Optional.empty(), - Optional.empty(), - OptionalLong.empty()); - assertFalse(shouldEnablePushdownForTable(session, newTable, "s3://fakeBucket/fakeObject", Optional.empty())); - - Partition newPartition = new Partition("db", - "table", - emptyList(), - storage, - singletonList(newColumn), - emptyMap()); - assertFalse(shouldEnablePushdownForTable(session, newTable, "s3://fakeBucket/fakeObject", Optional.of(newPartition))); - } - - @Test - public void testShouldEnableSplits() - { - // Uncompressed CSV - assertTrue(isSplittable(true, schema, "s3://fakeBucket/fakeObject.csv")); - // Pushdown disabled - assertTrue(isSplittable(false, schema, "s3://fakeBucket/fakeObject.csv")); - // JSON - Properties jsonSchema = new Properties(); - jsonSchema.setProperty(FILE_INPUT_FORMAT, TextInputFormat.class.getName()); - jsonSchema.setProperty(SERIALIZATION_LIB, JsonSerDe.class.getName()); - assertTrue(isSplittable(true, jsonSchema, "s3://fakeBucket/fakeObject.json")); - } - - @Test - public void testShouldNotEnableSplits() - { - // Compressed file - assertFalse(isSplittable(true, schema, "s3://fakeBucket/fakeObject.gz")); - } - - @AfterClass(alwaysRun = true) - public void tearDown() - { - session = null; - table = null; - partition = null; - storage = null; - column = null; - schema = null; - } -} diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/s3select/TestS3SelectRecordCursorProvider.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/s3select/TestS3SelectRecordCursorProvider.java deleted file mode 100644 index b7f263575ff36..0000000000000 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/s3select/TestS3SelectRecordCursorProvider.java +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package io.trino.plugin.hive.s3select; - -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; -import io.trino.filesystem.Location; -import io.trino.hadoop.ConfigurationInstantiator; -import io.trino.plugin.hive.HiveColumnHandle; -import io.trino.plugin.hive.HiveConfig; -import io.trino.plugin.hive.HiveRecordCursorProvider.ReaderRecordCursorWithProjections; -import io.trino.plugin.hive.TestBackgroundHiveSplitLoader.TestingHdfsEnvironment; -import io.trino.spi.predicate.Domain; -import io.trino.spi.predicate.Range; -import io.trino.spi.predicate.SortedRangeSet; -import io.trino.spi.predicate.TupleDomain; -import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; -import org.apache.hive.hcatalog.data.JsonSerDe; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.List; -import java.util.Optional; -import java.util.Properties; -import java.util.function.Function; -import java.util.stream.Collectors; - -import static io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; -import static io.trino.plugin.hive.HiveColumnHandle.createBaseColumn; -import static io.trino.plugin.hive.HiveTestUtils.SESSION; -import static io.trino.plugin.hive.HiveType.HIVE_INT; -import static io.trino.plugin.hive.HiveType.HIVE_STRING; -import static io.trino.spi.predicate.TupleDomain.withColumnDomains; -import static io.trino.spi.type.BigintType.BIGINT; -import static io.trino.spi.type.DateType.DATE; -import static io.trino.spi.type.IntegerType.INTEGER; -import static io.trino.spi.type.VarcharType.VARCHAR; -import static io.trino.type.InternalTypeManager.TESTING_TYPE_MANAGER; -import static org.apache.hadoop.hive.serde.serdeConstants.LIST_COLUMNS; -import static org.apache.hadoop.hive.serde.serdeConstants.LIST_COLUMN_TYPES; -import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB; -import static org.testng.Assert.assertTrue; - -public class TestS3SelectRecordCursorProvider -{ - private static final HiveColumnHandle ARTICLE_COLUMN = createBaseColumn("article", 1, HIVE_STRING, VARCHAR, REGULAR, Optional.empty()); - private static final HiveColumnHandle AUTHOR_COLUMN = createBaseColumn("author", 1, HIVE_STRING, VARCHAR, REGULAR, Optional.empty()); - private static final HiveColumnHandle DATE_ARTICLE_COLUMN = createBaseColumn("date_pub", 1, HIVE_INT, DATE, REGULAR, Optional.empty()); - private static final HiveColumnHandle QUANTITY_COLUMN = createBaseColumn("quantity", 1, HIVE_INT, INTEGER, REGULAR, Optional.empty()); - - @Test - public void shouldReturnSelectRecordCursor() - { - List readerColumns = new ArrayList<>(); - TupleDomain effectivePredicate = TupleDomain.all(); - Optional recordCursor = - getRecordCursor(effectivePredicate, readerColumns, true); - assertTrue(recordCursor.isPresent()); - } - - @Test - public void shouldReturnSelectRecordCursorWhenEffectivePredicateExists() - { - TupleDomain effectivePredicate = withColumnDomains(ImmutableMap.of(QUANTITY_COLUMN, - Domain.create(SortedRangeSet.copyOf(BIGINT, ImmutableList.of(Range.equal(BIGINT, 3L))), false))); - Optional recordCursor = - getRecordCursor(effectivePredicate, getAllColumns(), true); - assertTrue(recordCursor.isPresent()); - } - - @Test - public void shouldReturnSelectRecordCursorWhenProjectionExists() - { - TupleDomain effectivePredicate = TupleDomain.all(); - List readerColumns = ImmutableList.of(QUANTITY_COLUMN, AUTHOR_COLUMN, ARTICLE_COLUMN); - Optional recordCursor = - getRecordCursor(effectivePredicate, readerColumns, true); - 
assertTrue(recordCursor.isPresent()); - } - - @Test - public void shouldNotReturnSelectRecordCursorWhenPushdownIsDisabled() - { - List readerColumns = new ArrayList<>(); - TupleDomain effectivePredicate = TupleDomain.all(); - Optional recordCursor = - getRecordCursor(effectivePredicate, readerColumns, false); - assertTrue(recordCursor.isEmpty()); - } - - @Test - public void shouldNotReturnSelectRecordCursorWhenQueryIsNotFiltering() - { - TupleDomain effectivePredicate = TupleDomain.all(); - Optional recordCursor = - getRecordCursor(effectivePredicate, getAllColumns(), true); - assertTrue(recordCursor.isEmpty()); - } - - @Test - public void shouldNotReturnSelectRecordCursorWhenProjectionOrderIsDifferent() - { - TupleDomain effectivePredicate = TupleDomain.all(); - List readerColumns = ImmutableList.of(DATE_ARTICLE_COLUMN, QUANTITY_COLUMN, ARTICLE_COLUMN, AUTHOR_COLUMN); - Optional recordCursor = - getRecordCursor(effectivePredicate, readerColumns, true); - assertTrue(recordCursor.isEmpty()); - } - - @Test - public void testDisableExperimentalFeatures() - { - List readerColumns = new ArrayList<>(); - TupleDomain effectivePredicate = TupleDomain.all(); - S3SelectRecordCursorProvider s3SelectRecordCursorProvider = new S3SelectRecordCursorProvider( - new TestingHdfsEnvironment(new ArrayList<>()), - new TrinoS3ClientFactory(new HiveConfig()), - new HiveConfig().setS3SelectExperimentalPushdownEnabled(false)); - - Optional csvRecordCursor = s3SelectRecordCursorProvider.createRecordCursor( - ConfigurationInstantiator.newEmptyConfiguration(), - SESSION, - Location.of("s3://fakeBucket/fakeObject.gz"), - 0, - 10, - 10, - createTestingSchema(LazySimpleSerDe.class.getName()), - readerColumns, - effectivePredicate, - TESTING_TYPE_MANAGER, - true); - assertTrue(csvRecordCursor.isEmpty()); - - Optional jsonRecordCursor = s3SelectRecordCursorProvider.createRecordCursor( - ConfigurationInstantiator.newEmptyConfiguration(), - SESSION, - Location.of("s3://fakeBucket/fakeObject.gz"), - 0, - 10, - 10, - createTestingSchema(JsonSerDe.class.getName()), - readerColumns, - effectivePredicate, - TESTING_TYPE_MANAGER, - true); - assertTrue(jsonRecordCursor.isPresent()); - } - - private static Optional getRecordCursor(TupleDomain effectivePredicate, - List readerColumns, - boolean s3SelectPushdownEnabled) - { - S3SelectRecordCursorProvider s3SelectRecordCursorProvider = new S3SelectRecordCursorProvider( - new TestingHdfsEnvironment(new ArrayList<>()), - new TrinoS3ClientFactory(new HiveConfig()), - new HiveConfig().setS3SelectExperimentalPushdownEnabled(true)); - - return s3SelectRecordCursorProvider.createRecordCursor( - ConfigurationInstantiator.newEmptyConfiguration(), - SESSION, - Location.of("s3://fakeBucket/fakeObject.gz"), - 0, - 10, - 10, - createTestingSchema(), - readerColumns, - effectivePredicate, - TESTING_TYPE_MANAGER, - s3SelectPushdownEnabled); - } - - private static Properties createTestingSchema() - { - return createTestingSchema(LazySimpleSerDe.class.getName()); - } - - private static Properties createTestingSchema(String serdeClassName) - { - List schemaColumns = getAllColumns(); - Properties schema = new Properties(); - String columnNames = buildPropertyFromColumns(schemaColumns, HiveColumnHandle::getName); - String columnTypeNames = buildPropertyFromColumns(schemaColumns, column -> column.getHiveType().getTypeInfo().getTypeName()); - schema.setProperty(LIST_COLUMNS, columnNames); - schema.setProperty(LIST_COLUMN_TYPES, columnTypeNames); - schema.setProperty(SERIALIZATION_LIB, serdeClassName); - 
return schema; - } - - private static String buildPropertyFromColumns(List columns, Function mapper) - { - if (columns.isEmpty()) { - return ""; - } - return columns.stream() - .map(mapper) - .collect(Collectors.joining(",")); - } - - private static List getAllColumns() - { - return ImmutableList.of(ARTICLE_COLUMN, AUTHOR_COLUMN, DATE_ARTICLE_COLUMN, QUANTITY_COLUMN); - } -} diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/s3select/TestUnrecoverableS3OperationException.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/s3select/TestUnrecoverableS3OperationException.java deleted file mode 100644 index 8bc3abadbea56..0000000000000 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/s3select/TestUnrecoverableS3OperationException.java +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.trino.plugin.hive.s3select; - -import org.testng.annotations.Test; - -import java.io.IOException; - -import static io.trino.plugin.hive.s3select.S3SelectLineRecordReader.UnrecoverableS3OperationException; -import static org.assertj.core.api.Assertions.assertThat; - -public class TestUnrecoverableS3OperationException -{ - @Test - public void testMessage() - { - assertThat(new UnrecoverableS3OperationException("test-bucket", "test-key", new IOException("test io exception"))) - .hasMessage("java.io.IOException: test io exception (Bucket: test-bucket, Key: test-key)"); - } -} diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/CompressionConfigUtil.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/util/CompressionConfigUtil.java similarity index 96% rename from plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/CompressionConfigUtil.java rename to plugin/trino-hive/src/test/java/io/trino/plugin/hive/util/CompressionConfigUtil.java index eaea72e216829..368796d108f31 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/CompressionConfigUtil.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/util/CompressionConfigUtil.java @@ -15,7 +15,6 @@ import io.trino.hive.orc.OrcConf; import io.trino.plugin.hive.HiveCompressionCodec; -import org.apache.avro.mapred.AvroJob; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.parquet.hadoop.ParquetOutputFormat; @@ -51,7 +50,7 @@ public static void configureCompression(Configuration config, HiveCompressionCod config.set(ParquetOutputFormat.COMPRESSION, compressionCodec.getParquetCompressionCodec().name()); // For Avro - compressionCodec.getAvroCompressionKind().ifPresent(kind -> config.set(AvroJob.OUTPUT_CODEC, kind.toString())); + compressionCodec.getAvroCompressionKind().ifPresent(kind -> config.set("avro.output.codec", kind.toString())); // For SequenceFile config.set(FileOutputFormat.COMPRESS_TYPE, BLOCK.toString()); diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/DecimalUtils.java 
b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/util/DecimalUtils.java similarity index 100% rename from plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/DecimalUtils.java rename to plugin/trino-hive/src/test/java/io/trino/plugin/hive/util/DecimalUtils.java diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/SerDeUtils.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/util/SerDeUtils.java similarity index 100% rename from plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/SerDeUtils.java rename to plugin/trino-hive/src/test/java/io/trino/plugin/hive/util/SerDeUtils.java diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/util/TestCompressionConfigUtil.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/util/TestCompressionConfigUtil.java deleted file mode 100644 index be8d69d136919..0000000000000 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/util/TestCompressionConfigUtil.java +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.trino.plugin.hive.util; - -import io.trino.plugin.hive.HiveCompressionCodec; -import org.apache.hadoop.conf.Configuration; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.Arrays; - -import static io.trino.hadoop.ConfigurationInstantiator.newEmptyConfiguration; - -public class TestCompressionConfigUtil -{ - @Test(dataProvider = "compressionCodes") - public void testAssertCompressionConfigured(HiveCompressionCodec compressionCodec) - { - Configuration config = newEmptyConfiguration(); - CompressionConfigUtil.configureCompression(config, compressionCodec); - } - - @DataProvider - public Object[][] compressionCodes() - { - return Arrays.stream(HiveCompressionCodec.values()) - .map(codec -> new Object[] {codec}) - .toArray(Object[][]::new); - } -} diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/util/TestForwardingRecordCursor.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/util/TestForwardingRecordCursor.java deleted file mode 100644 index 019ea8528bfd1..0000000000000 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/util/TestForwardingRecordCursor.java +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package io.trino.plugin.hive.util; - -import io.trino.spi.connector.RecordCursor; -import org.testng.annotations.Test; - -import static io.trino.spi.testing.InterfaceTestUtils.assertAllMethodsOverridden; - -public class TestForwardingRecordCursor -{ - @Test - public void testAllMethodsOverridden() - { - assertAllMethodsOverridden(RecordCursor.class, ForwardingRecordCursor.class); - } -} diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/util/TestHiveUtil.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/util/TestHiveUtil.java index 5ae8d4470c03e..11c55eaf77b5e 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/util/TestHiveUtil.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/util/TestHiveUtil.java @@ -13,17 +13,9 @@ */ package io.trino.plugin.hive.util; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.common.FileUtils; import org.apache.hadoop.hive.metastore.Warehouse; import org.apache.hadoop.hive.metastore.api.MetaException; -import org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat; -import org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat; -import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat; -import org.apache.hadoop.hive.serde2.thrift.ThriftDeserializer; -import org.apache.hadoop.hive.serde2.thrift.test.IntString; -import org.apache.hadoop.mapred.TextInputFormat; -import org.apache.thrift.protocol.TBinaryProtocol; import org.joda.time.DateTime; import org.joda.time.DateTimeZone; import org.joda.time.format.DateTimeFormat; @@ -32,24 +24,12 @@ import java.util.AbstractList; import java.util.ArrayList; import java.util.List; -import java.util.Properties; - -import static io.airlift.testing.Assertions.assertInstanceOf; -import static io.trino.hadoop.ConfigurationInstantiator.newEmptyConfiguration; -import static io.trino.plugin.hive.HiveStorageFormat.AVRO; -import static io.trino.plugin.hive.HiveStorageFormat.PARQUET; -import static io.trino.plugin.hive.HiveStorageFormat.SEQUENCEFILE; -import static io.trino.plugin.hive.util.HiveReaderUtil.getDeserializer; -import static io.trino.plugin.hive.util.HiveReaderUtil.getInputFormat; + import static io.trino.plugin.hive.util.HiveUtil.escapeSchemaName; import static io.trino.plugin.hive.util.HiveUtil.escapeTableName; import static io.trino.plugin.hive.util.HiveUtil.parseHiveTimestamp; import static io.trino.plugin.hive.util.HiveUtil.toPartitionValues; import static io.trino.type.DateTimes.MICROSECONDS_PER_MILLISECOND; -import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_INPUT_FORMAT; -import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_CLASS; -import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_FORMAT; -import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.testng.Assert.assertEquals; @@ -67,17 +47,6 @@ public void testParseHiveTimestamp() assertEquals(parse(time, "yyyy-MM-dd HH:mm:ss.SSSSSSSSS"), unixTime(time, 7)); } - @Test - public void testGetThriftDeserializer() - { - Properties schema = new Properties(); - schema.setProperty(SERIALIZATION_LIB, ThriftDeserializer.class.getName()); - schema.setProperty(SERIALIZATION_CLASS, IntString.class.getName()); - schema.setProperty(SERIALIZATION_FORMAT, TBinaryProtocol.class.getName()); - - assertInstanceOf(getDeserializer(newEmptyConfiguration(), schema), 
ThriftDeserializer.class); - } - @Test public void testToPartitionValues() throws MetaException @@ -90,37 +59,6 @@ public void testToPartitionValues() assertToPartitionValues("pk=__HIVE_DEFAULT_PARTITION__"); } - @Test - public void testGetInputFormat() - { - Configuration configuration = newEmptyConfiguration(); - - // LazySimpleSerDe is used by TEXTFILE and SEQUENCEFILE. getInputFormat should default to TEXTFILE - // per Hive spec. - Properties sequenceFileSchema = new Properties(); - sequenceFileSchema.setProperty(FILE_INPUT_FORMAT, SymlinkTextInputFormat.class.getName()); - sequenceFileSchema.setProperty(SERIALIZATION_LIB, SEQUENCEFILE.getSerde()); - assertInstanceOf(getInputFormat(configuration, sequenceFileSchema), TextInputFormat.class); - - Properties avroSymlinkSchema = new Properties(); - avroSymlinkSchema.setProperty(FILE_INPUT_FORMAT, SymlinkTextInputFormat.class.getName()); - avroSymlinkSchema.setProperty(SERIALIZATION_LIB, AVRO.getSerde()); - assertInstanceOf(getInputFormat(configuration, avroSymlinkSchema), AvroContainerInputFormat.class); - - Properties parquetSymlinkSchema = new Properties(); - parquetSymlinkSchema.setProperty(FILE_INPUT_FORMAT, SymlinkTextInputFormat.class.getName()); - parquetSymlinkSchema.setProperty(SERIALIZATION_LIB, PARQUET.getSerde()); - assertInstanceOf(getInputFormat(configuration, parquetSymlinkSchema), MapredParquetInputFormat.class); - - Properties parquetSchema = new Properties(); - parquetSchema.setProperty(FILE_INPUT_FORMAT, PARQUET.getInputFormat()); - assertInstanceOf(getInputFormat(configuration, parquetSchema), MapredParquetInputFormat.class); - - Properties legacyParquetSchema = new Properties(); - legacyParquetSchema.setProperty(FILE_INPUT_FORMAT, "parquet.hive.MapredParquetInputFormat"); - assertInstanceOf(getInputFormat(configuration, legacyParquetSchema), MapredParquetInputFormat.class); - } - @Test public void testUnescapePathName() { diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/util/TestHiveWriteUtils.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/util/TestHiveWriteUtils.java index 2bb269ed70c93..c633734a4b0b6 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/util/TestHiveWriteUtils.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/util/TestHiveWriteUtils.java @@ -14,14 +14,29 @@ package io.trino.plugin.hive.util; import io.trino.hdfs.HdfsContext; +import io.trino.spi.Page; +import io.trino.spi.PageBuilder; +import io.trino.spi.block.BlockBuilder; +import io.trino.spi.type.DecimalType; +import io.trino.spi.type.SqlDecimal; +import io.trino.spi.type.Type; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.common.type.HiveDecimal; import org.testng.annotations.Test; +import java.util.List; + import static io.trino.plugin.hive.HiveTestUtils.HDFS_ENVIRONMENT; +import static io.trino.plugin.hive.util.HiveWriteUtils.createPartitionValues; import static io.trino.plugin.hive.util.HiveWriteUtils.isS3FileSystem; import static io.trino.plugin.hive.util.HiveWriteUtils.isViewFileSystem; +import static io.trino.spi.type.DecimalType.createDecimalType; +import static io.trino.spi.type.Decimals.writeBigDecimal; +import static io.trino.spi.type.Decimals.writeShortDecimal; +import static io.trino.spi.type.SqlDecimal.decimal; import static io.trino.testing.TestingConnectorSession.SESSION; import static io.trino.testing.TestingNames.randomNameSuffix; +import static org.assertj.core.api.Assertions.assertThat; import static org.testng.Assert.assertFalse; import static 
org.testng.Assert.assertTrue;
@@ -49,4 +64,49 @@ public void testIsViewFileSystem()
         assertTrue(isViewFileSystem(CONTEXT, HDFS_ENVIRONMENT, viewfsPath));
         assertFalse(isViewFileSystem(CONTEXT, HDFS_ENVIRONMENT, nonViewfsPath));
     }
+
+    @Test
+    public void testCreatePartitionValuesDecimal()
+    {
+        assertCreatePartitionValuesDecimal(10, 0, "12345", "12345");
+        assertCreatePartitionValuesDecimal(10, 2, "123.45", "123.45");
+        assertCreatePartitionValuesDecimal(10, 2, "12345.00", "12345");
+        assertCreatePartitionValuesDecimal(5, 0, "12345", "12345");
+        assertCreatePartitionValuesDecimal(38, 2, "12345.00", "12345");
+        assertCreatePartitionValuesDecimal(38, 20, "12345.00000000000000000000", "12345");
+        assertCreatePartitionValuesDecimal(38, 20, "12345.67898000000000000000", "12345.67898");
+    }
+
+    private static void assertCreatePartitionValuesDecimal(int precision, int scale, String decimalValue, String expectedValue)
+    {
+        DecimalType decimalType = createDecimalType(precision, scale);
+        List<Type> types = List.of(decimalType);
+        SqlDecimal decimal = decimal(decimalValue, decimalType);
+
+        // verify the test values are as expected
+        assertThat(decimal.toString()).isEqualTo(decimalValue);
+        assertThat(decimal.toBigDecimal().toString()).isEqualTo(decimalValue);
+
+        PageBuilder pageBuilder = new PageBuilder(types);
+        pageBuilder.declarePosition();
+        writeDecimal(decimalType, decimal, pageBuilder.getBlockBuilder(0));
+        Page page = pageBuilder.build();
+
+        // verify the expected value against HiveDecimal
+        assertThat(HiveDecimal.create(decimal.toBigDecimal()).toString())
+                .isEqualTo(expectedValue);
+
+        assertThat(createPartitionValues(types, page, 0))
+                .isEqualTo(List.of(expectedValue));
+    }
+
+    private static void writeDecimal(DecimalType decimalType, SqlDecimal decimal, BlockBuilder blockBuilder)
+    {
+        if (decimalType.isShort()) {
+            writeShortDecimal(blockBuilder, decimal.toBigDecimal().unscaledValue().longValue());
+        }
+        else {
+            writeBigDecimal(decimalType, blockBuilder, decimal.toBigDecimal());
+        }
+    }
 }
diff --git a/plugin/trino-hive/src/main/java/org/apache/parquet/hadoop/DisabledMemoryManager.java b/plugin/trino-hive/src/test/java/org/apache/parquet/hadoop/DisabledMemoryManager.java
similarity index 100%
rename from plugin/trino-hive/src/main/java/org/apache/parquet/hadoop/DisabledMemoryManager.java
rename to plugin/trino-hive/src/test/java/org/apache/parquet/hadoop/DisabledMemoryManager.java
diff --git a/plugin/trino-hudi/pom.xml b/plugin/trino-hudi/pom.xml
index 0512460aca906..59408435f0373 100644
--- a/plugin/trino-hudi/pom.xml
+++ b/plugin/trino-hudi/pom.xml
@@ -93,13 +93,6 @@ io.trino trino-hive - - - - io.trino.hive - hive-apache - -
diff --git a/plugin/trino-iceberg/pom.xml b/plugin/trino-iceberg/pom.xml
index 57c3f746c3724..d436a8062db23 100644
--- a/plugin/trino-iceberg/pom.xml
+++ b/plugin/trino-iceberg/pom.xml
@@ -143,11 +143,6 @@ io.airlift http-client - - io.trino.hive - hive-apache -
@@ -527,6 +522,12 @@ test + + io.trino.hive + hive-apache + test + + io.trino.tpch tpch