diff --git a/java/dataset/pom.xml b/java/dataset/pom.xml
index 9eadf896888a5..4d5df9c45e7ac 100644
--- a/java/dataset/pom.xml
+++ b/java/dataset/pom.xml
@@ -109,6 +109,38 @@
       <artifactId>jackson-databind</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.apache.arrow.orc</groupId>
+      <artifactId>arrow-orc</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.orc</groupId>
+      <artifactId>orc-core</artifactId>
+      <version>1.7.6</version>
+      <scope>test</scope>
+      <exclusions>
+        <exclusion>
+          <groupId>log4j</groupId>
+          <artifactId>log4j</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.slf4j</groupId>
+          <artifactId>slf4j-log4j12</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>commons-logging</groupId>
+          <artifactId>commons-logging</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hive</groupId>
+      <artifactId>hive-storage-api</artifactId>
+      <version>2.8.1</version>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
diff --git a/java/dataset/src/main/cpp/jni_wrapper.cc b/java/dataset/src/main/cpp/jni_wrapper.cc
index d088163903457..ef9178b1b5d4f 100644
--- a/java/dataset/src/main/cpp/jni_wrapper.cc
+++ b/java/dataset/src/main/cpp/jni_wrapper.cc
@@ -91,6 +91,8 @@ arrow::Result<std::shared_ptr<arrow::dataset::FileFormat>> GetFileFormat(
       return std::make_shared<arrow::dataset::ParquetFileFormat>();
     case 1:
       return std::make_shared<arrow::dataset::IpcFileFormat>();
+    case 2:
+      return std::make_shared<arrow::dataset::OrcFileFormat>();
     default:
       std::string error_message =
           "illegal file format id: " + std::to_string(file_format_id);
diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileFormat.java b/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileFormat.java
index 343e458ce23a9..b428b254b10ca 100644
--- a/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileFormat.java
+++ b/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileFormat.java
@@ -23,6 +23,7 @@
 public enum FileFormat {
   PARQUET(0),
   ARROW_IPC(1),
+  ORC(2),
   NONE(-1);
 
   private final int id;
diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/OrcWriteSupport.java b/java/dataset/src/test/java/org/apache/arrow/dataset/OrcWriteSupport.java
new file mode 100644
index 0000000000000..c49612995ee1e
--- /dev/null
+++ b/java/dataset/src/test/java/org/apache/arrow/dataset/OrcWriteSupport.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.dataset;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.OrcFile;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.Writer;
+
+public class OrcWriteSupport {
+  public static void writeTempFile(TypeDescription orcSchema, Path path, Integer[] values) throws IOException {
+    Writer writer = OrcFile.createWriter(path, OrcFile.writerOptions(new Configuration()).setSchema(orcSchema));
+    VectorizedRowBatch batch = orcSchema.createRowBatch();
+    LongColumnVector longColumnVector = (LongColumnVector) batch.cols[0];
+    for (int idx = 0; idx < values.length; idx++) {
+      longColumnVector.vector[idx] = values[idx];
+    }
+    batch.size = values.length;
+    writer.addRowBatch(batch);
+    writer.close();
+  }
+}
diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java b/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java
index 2fd8a19bac1f1..b8d51a3edb169 100644
--- a/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java
+++ b/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java
@@ -37,6 +37,7 @@
 import java.util.concurrent.Executors;
 import java.util.stream.Collectors;
 
+import org.apache.arrow.dataset.OrcWriteSupport;
 import org.apache.arrow.dataset.ParquetWriteSupport;
 import org.apache.arrow.dataset.jni.NativeDataset;
 import org.apache.arrow.dataset.jni.NativeInstanceReleasedException;
@@ -59,6 +60,8 @@
 import org.apache.arrow.vector.types.pojo.Schema;
 import org.apache.avro.generic.GenericRecord;
 import org.apache.avro.generic.GenericRecordBuilder;
+import org.apache.hadoop.fs.Path;
+import org.apache.orc.TypeDescription;
 import org.junit.Assert;
 import org.junit.ClassRule;
 import org.junit.Test;
@@ -357,6 +360,34 @@ public void testBaseArrowIpcRead() throws Exception {
     AutoCloseables.close(factory);
   }
 
+  @Test
+  public void testBaseOrcRead() throws Exception {
+    String dataName = "test-orc";
+    String basePath = TMP.getRoot().getAbsolutePath();
+
+    TypeDescription orcSchema = TypeDescription.fromString("struct<ints:int>");
+    Path path = new Path(basePath, dataName);
+    OrcWriteSupport.writeTempFile(orcSchema, path, new Integer[]{Integer.MIN_VALUE, Integer.MAX_VALUE});
+
+    String orcDatasetUri = new File(basePath, dataName).toURI().toString();
+    FileSystemDatasetFactory factory = new FileSystemDatasetFactory(rootAllocator(), NativeMemoryPool.getDefault(),
+        FileFormat.ORC, orcDatasetUri);
+    ScanOptions options = new ScanOptions(100);
+    Schema schema = inferResultSchemaFromFactory(factory, options);
+    List<ArrowRecordBatch> datum = collectResultFromFactory(factory, options);
+
+    assertSingleTaskProduced(factory, options);
+    assertEquals(1, datum.size());
+    assertEquals(1, schema.getFields().size());
+    assertEquals("ints", schema.getFields().get(0).getName());
+
+    String expectedJsonUnordered = "[[2147483647], [-2147483648]]";
+    checkParquetReadResult(schema, expectedJsonUnordered, datum);
+
+    AutoCloseables.close(datum);
+    AutoCloseables.close(factory);
+  }
+
   private void checkParquetReadResult(Schema schema, String expectedJson, List<ArrowRecordBatch> actual)
       throws IOException {
     final ObjectMapper json = new ObjectMapper();
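For context, a minimal sketch of how the new FileFormat.ORC entry could be consumed from user code, separate from the test harness above. This is illustrative only and not part of the patch: the file URI is a placeholder, and it assumes the arrow-dataset JNI library is loadable; it uses only the FileSystemDatasetFactory constructor shown in the test plus the DatasetFactory.inspect() accessor.

// Hypothetical usage sketch (not part of this PR): print the schema of an
// existing ORC file via the new FileFormat.ORC id.
import org.apache.arrow.dataset.file.FileFormat;
import org.apache.arrow.dataset.file.FileSystemDatasetFactory;
import org.apache.arrow.dataset.jni.NativeMemoryPool;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.types.pojo.Schema;

public class OrcSchemaSketch {
  public static void main(String[] args) throws Exception {
    String uri = "file:///tmp/data.orc"; // assumed path; replace with a real ORC file
    try (BufferAllocator allocator = new RootAllocator()) {
      FileSystemDatasetFactory factory = new FileSystemDatasetFactory(
          allocator, NativeMemoryPool.getDefault(), FileFormat.ORC, uri);
      // inspect() asks the native dataset factory for the unified schema
      // of the discovered files.
      Schema schema = factory.inspect();
      System.out.println(schema);
      factory.close();
    }
  }
}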