Skip to content

Commit

Permalink
ARROW-17786: [Java] Read CSV files using org.apache.arrow.dataset.jni…
Browse files Browse the repository at this point in the history
….NativeDatasetFactory (#14182)

Support the CSV file format in the Java Dataset API

Authored-by: david dali susanibar arce <[email protected]>
Signed-off-by: David Li <[email protected]>
  • Loading branch information
davisusanibar authored Oct 11, 2022
1 parent fa3cf78 commit a39f219
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 9 deletions.
4 changes: 4 additions & 0 deletions java/dataset/src/main/cpp/jni_wrapper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,10 @@ arrow::Result<std::shared_ptr<arrow::dataset::FileFormat>> GetFileFormat(
#ifdef ARROW_ORC
case 2:
return std::make_shared<arrow::dataset::OrcFileFormat>();
#endif
#ifdef ARROW_CSV
case 3:
return std::make_shared<arrow::dataset::CsvFileFormat>();
#endif
default:
std::string error_message =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ public enum FileFormat {
PARQUET(0),
ARROW_IPC(1),
ORC(2),
CSV(3),
NONE(-1);

private final int id;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.arrow.dataset;

import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.Random;

/**
 * Test helper that writes lines of text into a randomly named {@code generated-<n>.csv}
 * file inside a caller-supplied folder and exposes that file's location as a URI string.
 */
public class CsvWriteSupport {
  private final URI uri;
  private final Random random = new Random();

  /**
   * Chooses the target file name inside {@code outputFolder}. The file is not created here.
   *
   * @param outputFolder folder that will hold the generated file
   * @throws URISyntaxException kept in the signature so existing callers that handle it still compile
   */
  public CsvWriteSupport(File outputFolder) throws URISyntaxException {
    File target = new File(outputFolder.getPath(), "generated-" + random.nextLong() + ".csv");
    // File#toURI always yields an absolute, hierarchical "file:" URI, so it can be turned back
    // into a File even when outputFolder is relative or uses a non-'/' separator.
    uri = target.toURI();
  }

  /**
   * Creates a new helper for {@code outputFolder} and writes every value as its own line,
   * each terminated by {@code "\n"}.
   *
   * @param outputFolder folder that will hold the generated file
   * @param values lines to write, in order
   * @return the helper describing the written file
   * @throws URISyntaxException declared by the constructor
   * @throws IOException if the file cannot be opened or written
   */
  public static CsvWriteSupport writeTempFile(File outputFolder, String... values)
      throws URISyntaxException, IOException {
    CsvWriteSupport writer = new CsvWriteSupport(outputFolder);
    // Encode explicitly as UTF-8 instead of relying on the platform default charset.
    try (Writer out = new OutputStreamWriter(
        new FileOutputStream(new File(writer.uri), true), StandardCharsets.UTF_8)) {
      for (String value : values) {
        out.write(value + "\n");
      }
    }
    return writer;
  }

  /**
   * Returns the location of the generated file.
   *
   * @return the file URI in string form
   */
  public String getOutputURI() {
    return uri.toString();
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import java.util.concurrent.Executors;
import java.util.stream.Collectors;

import org.apache.arrow.dataset.CsvWriteSupport;
import org.apache.arrow.dataset.OrcWriteSupport;
import org.apache.arrow.dataset.ParquetWriteSupport;
import org.apache.arrow.dataset.jni.NativeDataset;
Expand Down Expand Up @@ -361,6 +362,30 @@ public void testBaseOrcRead() throws Exception {
AutoCloseables.close(factory);
}

/**
 * Writes a CSV file holding a header line and three data lines, scans it through a
 * {@code FileSystemDatasetFactory} created with {@code FileFormat.CSV}, and checks that a
 * single batch comes back with two fields, the first named "Name", whose contents match
 * the three data lines.
 */
@Test
public void testBaseCsvRead() throws Exception {
  CsvWriteSupport writeSupport = CsvWriteSupport.writeTempFile(
      TMP.newFolder(), "Name,Language", "Juno,Java", "Peter,Python", "Celin,C++");
  String expectedJsonUnordered = "[[\"Juno\", \"Java\"], [\"Peter\", \"Python\"], [\"Celin\", \"C++\"]]";
  ScanOptions options = new ScanOptions(100);
  try (
      FileSystemDatasetFactory factory = new FileSystemDatasetFactory(rootAllocator(), NativeMemoryPool.getDefault(),
          FileFormat.CSV, writeSupport.getOutputURI())
  ) {
    List<ArrowRecordBatch> datum = collectResultFromFactory(factory, options);
    try {
      Schema schema = inferResultSchemaFromFactory(factory, options);

      assertScanBatchesProduced(factory, options);
      assertEquals(1, datum.size());
      assertEquals(2, schema.getFields().size());
      assertEquals("Name", schema.getFields().get(0).getName());

      checkParquetReadResult(schema, expectedJsonUnordered, datum);
    } finally {
      // Release the collected batches even when an assertion above fails; this still runs
      // before the factory is closed by the enclosing try-with-resources.
      AutoCloseables.close(datum);
    }
  }
}

private void checkParquetReadResult(Schema schema, String expectedJson, List<ArrowRecordBatch> actual)
throws IOException {
final ObjectMapper json = new ObjectMapper();
Expand Down
32 changes: 23 additions & 9 deletions java/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -916,7 +916,7 @@
<profile>
<id>generate-cdata-dylib_so</id>
<properties>
<arrow.c.jni.dist.dir>java-dist/lib</arrow.c.jni.dist.dir>
<arrow.c.jni.dist.dir>java-dist</arrow.c.jni.dist.dir>
</properties>
<build>
<plugins>
Expand Down Expand Up @@ -981,8 +981,18 @@
<profile>
<id>generate-jni-dylib_so</id>
<properties>
<arrow.dataset.jni.dist.dir>java-dist/lib</arrow.dataset.jni.dist.dir>
<arrow.dataset.jni.dist.dir>java-dist</arrow.dataset.jni.dist.dir>
<cpp.dependencies.builded>false</cpp.dependencies.builded>
<ARROW_CSV>ON</ARROW_CSV>
<ARROW_ORC>ON</ARROW_ORC>
<ARROW_PARQUET>ON</ARROW_PARQUET>
<ARROW_PLASMA>OFF</ARROW_PLASMA>
<ARROW_GANDIVA>OFF</ARROW_GANDIVA>
<ARROW_JAVA_JNI_ENABLE_C>OFF</ARROW_JAVA_JNI_ENABLE_C>
<ARROW_JAVA_JNI_ENABLE_DATASET>ON</ARROW_JAVA_JNI_ENABLE_DATASET>
<ARROW_JAVA_JNI_ENABLE_GANDIVA>OFF</ARROW_JAVA_JNI_ENABLE_GANDIVA>
<ARROW_JAVA_JNI_ENABLE_ORC>ON</ARROW_JAVA_JNI_ENABLE_ORC>
<ARROW_JAVA_JNI_ENABLE_PLASMA>OFF</ARROW_JAVA_JNI_ENABLE_PLASMA>
</properties>
<build>
<plugins>
Expand Down Expand Up @@ -1015,16 +1025,16 @@
-S cpp
-B cpp-jni
-DARROW_BUILD_SHARED=OFF
-DARROW_CSV=ON
-DARROW_CSV=${ARROW_CSV}
-DARROW_DATASET=ON
-DARROW_DEPENDENCY_SOURCE=BUNDLED
-DARROW_DEPENDENCY_USE_SHARED=OFF
-DARROW_FILESYSTEM=ON
-DARROW_GANDIVA=ON
-DARROW_GANDIVA=${ARROW_GANDIVA}
-DARROW_GANDIVA_STATIC_LIBSTDCPP=ON
-DARROW_ORC=ON
-DARROW_PARQUET=ON
-DARROW_PLASMA=ON
-DARROW_ORC=${ARROW_ORC}
-DARROW_PARQUET=${ARROW_PARQUET}
-DARROW_PLASMA=${ARROW_PLASMA}
-DARROW_S3=ON
-DARROW_USE_CCACHE=ON
-DCMAKE_BUILD_TYPE=Release
Expand Down Expand Up @@ -1062,12 +1072,16 @@
<commandlineArgs>
-S java
-B java-jni
-DARROW_JAVA_JNI_ENABLE_C=OFF
-DARROW_JAVA_JNI_ENABLE_C=${ARROW_JAVA_JNI_ENABLE_C}
-DARROW_JAVA_JNI_ENABLE_DATASET=${ARROW_JAVA_JNI_ENABLE_DATASET}
-DARROW_JAVA_JNI_ENABLE_GANDIVA=${ARROW_JAVA_JNI_ENABLE_GANDIVA}
-DARROW_JAVA_JNI_ENABLE_ORC=${ARROW_JAVA_JNI_ENABLE_ORC}
-DARROW_JAVA_JNI_ENABLE_PLASMA=${ARROW_JAVA_JNI_ENABLE_PLASMA}
-DARROW_JAVA_JNI_ENABLE_DEFAULT=ON
-DBUILD_TESTING=OFF
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_INSTALL_LIBDIR=lib
-DCMAKE_INSTALL_PREFIX=${arrow.c.jni.dist.dir}
-DCMAKE_INSTALL_PREFIX=${arrow.dataset.jni.dist.dir}
-DCMAKE_PREFIX_PATH=${project.basedir}/../java-dist
</commandlineArgs>
<workingDirectory>../</workingDirectory>
Expand Down

0 comments on commit a39f219

Please sign in to comment.