diff --git a/java/dataset/src/main/cpp/jni_wrapper.cc b/java/dataset/src/main/cpp/jni_wrapper.cc
index 6fffc546eaf2e..aa7d767023242 100644
--- a/java/dataset/src/main/cpp/jni_wrapper.cc
+++ b/java/dataset/src/main/cpp/jni_wrapper.cc
@@ -94,6 +94,11 @@ arrow::Result<std::shared_ptr<arrow::dataset::FileFormat>> GetFileFormat(
 #ifdef ARROW_ORC
     case 2:
       return std::make_shared<arrow::dataset::OrcFileFormat>();
+#endif
+#ifdef ARROW_CSV
+    // Id 3 matches FileFormat.CSV in FileFormat.java; needs an ARROW_CSV build.
+    case 3:
+      return std::make_shared<arrow::dataset::CsvFileFormat>();
 #endif
     default:
       std::string error_message =
diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileFormat.java b/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileFormat.java
index b428b254b10ca..aad4fa5f2af48 100644
--- a/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileFormat.java
+++ b/java/dataset/src/main/java/org/apache/arrow/dataset/file/FileFormat.java
@@ -24,6 +24,7 @@ public enum FileFormat {
   PARQUET(0),
   ARROW_IPC(1),
   ORC(2),
+  CSV(3),
   NONE(-1);
 
   private final int id;
diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/CsvWriteSupport.java b/java/dataset/src/test/java/org/apache/arrow/dataset/CsvWriteSupport.java
new file mode 100644
index 0000000000000..954408ce25ecc
--- /dev/null
+++ b/java/dataset/src/test/java/org/apache/arrow/dataset/CsvWriteSupport.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.dataset;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.Writer;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
+import java.util.Random;
+
+/**
+ * Test helper that writes CSV text to a randomly named file and exposes the file's URI.
+ */
+public class CsvWriteSupport {
+  private final URI uri;
+
+  /**
+   * Chooses a {@code generated-<random>.csv} target inside {@code outputFolder}; nothing is written yet.
+   *
+   * @param outputFolder directory that will hold the CSV file
+   * @throws URISyntaxException never thrown by this implementation; declared so existing callers still compile
+   */
+  public CsvWriteSupport(File outputFolder) throws URISyntaxException {
+    // File.toURI() always yields a hierarchical file: URI. Concatenating File.separator into the
+    // scheme-specific part produced an opaque URI on Windows that new File(URI) rejects.
+    uri = new File(outputFolder.getPath(), "generated-" + new Random().nextLong() + ".csv").toURI();
+  }
+
+  /**
+   * Creates a new CSV file under {@code outputFolder} containing one line per element of {@code values}.
+   *
+   * @param outputFolder directory that will hold the CSV file
+   * @param values lines to write, in order; each is terminated with {@code \n}
+   * @return the writer describing the file that was produced
+   * @throws URISyntaxException declared for compatibility with the constructor
+   * @throws IOException if the file cannot be created or written
+   */
+  public static CsvWriteSupport writeTempFile(File outputFolder, String... values)
+      throws URISyntaxException, IOException {
+    CsvWriteSupport writer = new CsvWriteSupport(outputFolder);
+    // Encode explicitly as UTF-8 instead of relying on the platform default charset.
+    try (Writer addValues = Files.newBufferedWriter(Paths.get(writer.uri), StandardCharsets.UTF_8,
+        StandardOpenOption.CREATE, StandardOpenOption.APPEND)) {
+      for (String value : values) {
+        addValues.write(value + "\n");
+      }
+    }
+    return writer;
+  }
+
+  /**
+   * Returns the location of the CSV file as a URI string.
+   *
+   * @return the file URI in string form
+   */
+  public String getOutputURI() {
+    return uri.toString();
+  }
+}
diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java b/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java
index 9dc5f2b655a83..b8a13937a8aad 100644
--- a/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java
+++ b/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java
@@ -37,6 +37,7 @@
 import java.util.concurrent.Executors;
 import java.util.stream.Collectors;
 
+import org.apache.arrow.dataset.CsvWriteSupport;
 import org.apache.arrow.dataset.OrcWriteSupport;
 import org.apache.arrow.dataset.ParquetWriteSupport;
 import org.apache.arrow.dataset.jni.NativeDataset;
@@ -361,6 +362,30 @@ public void testBaseOrcRead() throws Exception {
     AutoCloseables.close(factory);
   }
 
+  @Test
+  public void testBaseCsvRead() throws Exception {
+    CsvWriteSupport writeSupport = CsvWriteSupport.writeTempFile(
+            TMP.newFolder(), "Name,Language", "Juno,Java", "Peter,Python", "Celin,C++");
+    String expectedJsonUnordered = "[[\"Juno\", \"Java\"], [\"Peter\", \"Python\"], [\"Celin\", \"C++\"]]";
+    ScanOptions options = new ScanOptions(100);
+    try (
+        FileSystemDatasetFactory factory = new FileSystemDatasetFactory(rootAllocator(), NativeMemoryPool.getDefault(),
+            FileFormat.CSV, writeSupport.getOutputURI())
+    ) {
+      List<ArrowRecordBatch> datum = collectResultFromFactory(factory, options);
+      Schema schema = inferResultSchemaFromFactory(factory, options);
+
+      assertScanBatchesProduced(factory, options);
+      assertEquals(1, datum.size());
+      assertEquals(2, schema.getFields().size());
+      assertEquals("Name", schema.getFields().get(0).getName());
+
+      checkParquetReadResult(schema, expectedJsonUnordered, datum); // reused for CSV despite its name
+
+      AutoCloseables.close(datum);
+    }
+  }
+
   private void checkParquetReadResult(Schema schema, String expectedJson, List<ArrowRecordBatch> actual)
       throws IOException {
     final ObjectMapper json = new ObjectMapper();
diff --git a/java/pom.xml b/java/pom.xml
index 0438bb8fda88c..666700d36497e 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -916,7 +916,7 @@
       generate-cdata-dylib_so
-        java-dist/lib
+        java-dist
@@ -981,8 +981,18 @@
       generate-jni-dylib_so
-        java-dist/lib
+        java-dist
         false
+        ON
+        ON
+        ON
+        OFF
+        OFF
+        OFF
+        ON
+        OFF
+        ON
+        OFF
@@ -1015,16 +1025,16 @@
                       -S cpp
                       -B cpp-jni
                       -DARROW_BUILD_SHARED=OFF
-                      -DARROW_CSV=ON
+                      -DARROW_CSV=${ARROW_CSV}
                       -DARROW_DATASET=ON
                       -DARROW_DEPENDENCY_SOURCE=BUNDLED
                       -DARROW_DEPENDENCY_USE_SHARED=OFF
                       -DARROW_FILESYSTEM=ON
-                      -DARROW_GANDIVA=ON
+                      -DARROW_GANDIVA=${ARROW_GANDIVA}
                       -DARROW_GANDIVA_STATIC_LIBSTDCPP=ON
-                      -DARROW_ORC=ON
-                      -DARROW_PARQUET=ON
-                      -DARROW_PLASMA=ON
+                      -DARROW_ORC=${ARROW_ORC}
+                      -DARROW_PARQUET=${ARROW_PARQUET}
+                      -DARROW_PLASMA=${ARROW_PLASMA}
                       -DARROW_S3=ON
                       -DARROW_USE_CCACHE=ON
                       -DCMAKE_BUILD_TYPE=Release
@@ -1062,12 +1072,16 @@
                       -S java
                       -B java-jni
-                      -DARROW_JAVA_JNI_ENABLE_C=OFF
+                      -DARROW_JAVA_JNI_ENABLE_C=${ARROW_JAVA_JNI_ENABLE_C}
+                      -DARROW_JAVA_JNI_ENABLE_DATASET=${ARROW_JAVA_JNI_ENABLE_DATASET}
+                      -DARROW_JAVA_JNI_ENABLE_GANDIVA=${ARROW_JAVA_JNI_ENABLE_GANDIVA}
+                      -DARROW_JAVA_JNI_ENABLE_ORC=${ARROW_JAVA_JNI_ENABLE_ORC}
+                      -DARROW_JAVA_JNI_ENABLE_PLASMA=${ARROW_JAVA_JNI_ENABLE_PLASMA}
                       -DARROW_JAVA_JNI_ENABLE_DEFAULT=ON
                       -DBUILD_TESTING=OFF
                       -DCMAKE_BUILD_TYPE=Release
                       -DCMAKE_INSTALL_LIBDIR=lib
-                      -DCMAKE_INSTALL_PREFIX=${arrow.c.jni.dist.dir}
+                      -DCMAKE_INSTALL_PREFIX=${arrow.dataset.jni.dist.dir}
                       -DCMAKE_PREFIX_PATH=${project.basedir}/../java-dist
                     ../