From 7c5484284f45c6a70b2532f6e87b9e99c5734159 Mon Sep 17 00:00:00 2001
From: James Duong
Date: Thu, 12 Oct 2023 09:51:40 -0700
Subject: [PATCH] GH-37943: [Java] Add parquet file with all supported types

Add a reference file with all supported types, and a corresponding test
case validating that the Dataset API generates output consistent with it.
---
 java/dataset/pom.xml                          |   7 +
 .../apache/arrow/dataset/TestAllTypes.java    | 262 ++++++++++++++++++
 .../org/apache/arrow/dataset/TestDataset.java |  41 +++
 .../dataset/file/TestDatasetFileWriter.java   |  41 ---
 .../apache/arrow/flight/FlightTestUtil.java   |  17 +-
 .../apache/arrow/util/ArrowTestDataUtil.java  |  43 +++
 testing                                       |   2 +-
 7 files changed, 356 insertions(+), 57 deletions(-)
 create mode 100644 java/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java
 create mode 100644 java/vector/src/test/java/org/apache/arrow/util/ArrowTestDataUtil.java

diff --git a/java/dataset/pom.xml b/java/dataset/pom.xml
index 055fa1cabd4d9..e971a918e4a9d 100644
--- a/java/dataset/pom.xml
+++ b/java/dataset/pom.xml
@@ -74,6 +74,13 @@
       <version>${parquet.version}</version>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.apache.arrow</groupId>
+      <artifactId>arrow-vector</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+      <classifier>tests</classifier>
+    </dependency>
     <dependency>
       <groupId>org.apache.hadoop</groupId>
       <artifactId>hadoop-common</artifactId>
diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java b/java/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java
new file mode 100644
index 0000000000000..d0163bec3674c
--- /dev/null
+++ b/java/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java
@@ -0,0 +1,262 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.arrow.dataset; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.nio.channels.Channels; +import java.nio.channels.SeekableByteChannel; +import java.nio.channels.WritableByteChannel; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Objects; + +import org.apache.arrow.dataset.file.DatasetFileWriter; +import org.apache.arrow.dataset.file.FileFormat; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.util.ArrowTestDataUtil; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.Decimal256Vector; +import org.apache.arrow.vector.DecimalVector; +import org.apache.arrow.vector.DurationVector; +import org.apache.arrow.vector.FixedSizeBinaryVector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.Float8Vector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.LargeVarBinaryVector; +import org.apache.arrow.vector.LargeVarCharVector; +import org.apache.arrow.vector.SmallIntVector; +import org.apache.arrow.vector.TimeMilliVector; +import org.apache.arrow.vector.TimeNanoVector; +import org.apache.arrow.vector.TimeStampMilliTZVector; +import org.apache.arrow.vector.TimeStampMilliVector; +import org.apache.arrow.vector.TimeStampNanoTZVector; +import org.apache.arrow.vector.TimeStampNanoVector; +import org.apache.arrow.vector.TinyIntVector; +import org.apache.arrow.vector.UInt1Vector; +import org.apache.arrow.vector.UInt2Vector; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.UInt8Vector; +import org.apache.arrow.vector.VarBinaryVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.impl.UnionFixedSizeListWriter; +import org.apache.arrow.vector.complex.impl.UnionLargeListWriter; +import org.apache.arrow.vector.complex.impl.UnionListWriter; +import org.apache.arrow.vector.ipc.ArrowStreamReader; +import org.apache.arrow.vector.ipc.ArrowStreamWriter; +import org.apache.arrow.vector.types.DateUnit; +import org.apache.arrow.vector.types.FloatingPointPrecision; +import org.apache.arrow.vector.types.TimeUnit; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel; +import org.apache.arrow.vector.util.Text; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +public class TestAllTypes extends TestDataset { + + @ClassRule + public static final TemporaryFolder TMP = new TemporaryFolder(); + + private VectorSchemaRoot generateAllTypesVector(BufferAllocator allocator) { + // Notes: + // - Float16 is not supported by Java. + // - IntervalMonthDayNano is not supported by Parquet. + // - Map (GH-38250) and SparseUnion are resulting in serialization errors when writing with the Dataset API. 
+ // "Unhandled type for Arrow to Parquet schema conversion" errors: IntervalDay, IntervalYear, DenseUnion + List childFields = new ArrayList<>(); + childFields.add(new Field("int-child", + new FieldType(false, new ArrowType.Int(32, true), null, null), null)); + Field structField = new Field("struct", + new FieldType(true, ArrowType.Struct.INSTANCE, null, null), childFields); + List fields = List.of( + Field.nullablePrimitive("null", ArrowType.Null.INSTANCE), + Field.nullablePrimitive("bool", ArrowType.Bool.INSTANCE), + Field.nullablePrimitive("int8", new ArrowType.Int(8, true)), + Field.nullablePrimitive("int16", new ArrowType.Int(16, true)), + Field.nullablePrimitive("int32", new ArrowType.Int(32, true)), + Field.nullablePrimitive("int64", new ArrowType.Int(64, true)), + Field.nullablePrimitive("uint8", new ArrowType.Int(8, false)), + Field.nullablePrimitive("uint16", new ArrowType.Int(16, false)), + Field.nullablePrimitive("uint32", new ArrowType.Int(32, false)), + Field.nullablePrimitive("uint64", new ArrowType.Int(64, false)), + Field.nullablePrimitive("float32", new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)), + Field.nullablePrimitive("float64", new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)), + Field.nullablePrimitive("utf8", ArrowType.Utf8.INSTANCE), + Field.nullablePrimitive("binary", ArrowType.Binary.INSTANCE), + Field.nullablePrimitive("largeutf8", ArrowType.LargeUtf8.INSTANCE), + Field.nullablePrimitive("largebinary", ArrowType.LargeBinary.INSTANCE), + Field.nullablePrimitive("fixed_size_binary", new ArrowType.FixedSizeBinary(1)), + Field.nullablePrimitive("date_ms", new ArrowType.Date(DateUnit.MILLISECOND)), + Field.nullablePrimitive("time_ms", new ArrowType.Time(TimeUnit.MILLISECOND, 32)), + Field.nullablePrimitive("timestamp_ms", new ArrowType.Timestamp(TimeUnit.MILLISECOND, null)), + Field.nullablePrimitive("timestamptz_ms", new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC")), + Field.nullablePrimitive("time_ns", new ArrowType.Time(TimeUnit.NANOSECOND, 64)), + Field.nullablePrimitive("timestamp_ns", new ArrowType.Timestamp(TimeUnit.NANOSECOND, null)), + Field.nullablePrimitive("timestamptz_ns", new ArrowType.Timestamp(TimeUnit.NANOSECOND, "UTC")), + Field.nullablePrimitive("duration", new ArrowType.Duration(TimeUnit.MILLISECOND)), + Field.nullablePrimitive("decimal128", new ArrowType.Decimal(10, 2, 128)), + Field.nullablePrimitive("decimal256", new ArrowType.Decimal(10, 2, 256)), + new Field("list", FieldType.nullable(ArrowType.List.INSTANCE), + Collections.singletonList(Field.nullable("items", new ArrowType.Int(32, true)))), + new Field("largelist", FieldType.nullable(ArrowType.LargeList.INSTANCE), + Collections.singletonList(Field.nullable("items", new ArrowType.Int(32, true)))), + new Field("fixedsizelist", FieldType.nullable(new ArrowType.FixedSizeList(2)), + Collections.singletonList(Field.nullable("items", new ArrowType.Int(32, true)))), + structField + ); + VectorSchemaRoot root = VectorSchemaRoot.create(new Schema(fields), allocator); + root.allocateNew(); + root.setRowCount(2); + + root.getVector("null").setNull(0); + root.getVector("bool").setNull(0); + root.getVector("int8").setNull(0); + root.getVector("int16").setNull(0); + root.getVector("int32").setNull(0); + root.getVector("int64").setNull(0); + root.getVector("uint8").setNull(0); + root.getVector("uint16").setNull(0); + root.getVector("uint32").setNull(0); + root.getVector("uint64").setNull(0); + root.getVector("float32").setNull(0); + root.getVector("float64").setNull(0); + 
root.getVector("utf8").setNull(0); + root.getVector("binary").setNull(0); + root.getVector("largeutf8").setNull(0); + root.getVector("largebinary").setNull(0); + root.getVector("fixed_size_binary").setNull(0); + root.getVector("date_ms").setNull(0); + root.getVector("time_ms").setNull(0); + root.getVector("time_ns").setNull(0); + root.getVector("timestamp_ms").setNull(0); + root.getVector("timestamp_ns").setNull(0); + root.getVector("timestamptz_ms").setNull(0); + root.getVector("timestamptz_ns").setNull(0); + root.getVector("duration").setNull(0); + root.getVector("decimal128").setNull(0); + root.getVector("decimal256").setNull(0); + root.getVector("fixedsizelist").setNull(0); + root.getVector("list").setNull(0); + root.getVector("largelist").setNull(0); + root.getVector("struct").setNull(0); + + root.getVector("null").setNull(1); + ((BitVector) root.getVector("bool")).set(1, 1); + ((TinyIntVector) root.getVector("int8")).set(1, 1); + ((SmallIntVector) root.getVector("int16")).set(1, 1); + ((IntVector) root.getVector("int32")).set(1, 1); + ((BigIntVector) root.getVector("int64")).set(1, 1); + ((UInt1Vector) root.getVector("uint8")).set(1, 1); + ((UInt2Vector) root.getVector("uint16")).set(1, 1); + ((UInt4Vector) root.getVector("uint32")).set(1, 1); + ((UInt8Vector) root.getVector("uint64")).set(1, 1); + ((Float4Vector) root.getVector("float32")).set(1, 1.0f); + ((Float8Vector) root.getVector("float64")).set(1, 1.0); + ((VarCharVector) root.getVector("utf8")).set(1, new Text("a")); + ((VarBinaryVector) root.getVector("binary")).set(1, new byte[] {0x01}); + ((LargeVarCharVector) root.getVector("largeutf8")).set(1, new Text("a")); + ((LargeVarBinaryVector) root.getVector("largebinary")).set(1, new byte[] {0x01}); + ((FixedSizeBinaryVector) root.getVector("fixed_size_binary")).set(1, new byte[] {0x01}); + ((DateMilliVector) root.getVector("date_ms")).set(1, 0); + ((TimeMilliVector) root.getVector("time_ms")).set(1, 0); + ((TimeNanoVector) root.getVector("time_ns")).set(1, 0); + ((TimeStampMilliVector) root.getVector("timestamp_ms")).set(1, 0); + ((TimeStampNanoVector) root.getVector("timestamp_ns")).set(1, 0); + ((TimeStampMilliTZVector) root.getVector("timestamptz_ms")).set(1, 0); + ((TimeStampNanoTZVector) root.getVector("timestamptz_ns")).set(1, 0); + ((DurationVector) root.getVector("duration")).set(1, 0); + ((DecimalVector) root.getVector("decimal128")).set(1, 0); + ((Decimal256Vector) root.getVector("decimal256")).set(1, 0); + UnionFixedSizeListWriter fixedListWriter = ((FixedSizeListVector) root.getVector("fixedsizelist")).getWriter(); + fixedListWriter.allocate(); + fixedListWriter.setPosition(1); + fixedListWriter.startList(); + fixedListWriter.integer().writeInt(1); + fixedListWriter.endList(); + + UnionListWriter listWriter = ((ListVector) root.getVector("list")).getWriter(); + listWriter.allocate(); + listWriter.setPosition(1); + listWriter.startList(); + listWriter.integer().writeInt(1); + listWriter.endList(); + + UnionLargeListWriter largeListWriter = ((LargeListVector) root.getVector("largelist")).getWriter(); + largeListWriter.allocate(); + largeListWriter.setPosition(1); + largeListWriter.startList(); + largeListWriter.integer().writeInt(1); + largeListWriter.endList(); + + ((StructVector) root.getVector("struct")).getChild("int-child", IntVector.class).set(1, 1); + return root; + } + + private byte[] serializeFile(VectorSchemaRoot root) { + try ( + ByteArrayOutputStream out = new ByteArrayOutputStream(); + WritableByteChannel channel = Channels.newChannel(out); + 
ArrowStreamWriter writer = new ArrowStreamWriter(root, null, channel)
+    ) {
+      writer.start();
+      writer.writeBatch();
+      writer.end();
+      return out.toByteArray();
+    } catch (IOException e) {
+      throw new IllegalArgumentException("Failed to serialize Arrow stream", e);
+    }
+  }
+
+  @Test
+  public void testAllTypesParquet() throws Exception {
+    try (VectorSchemaRoot root = generateAllTypesVector(rootAllocator())) {
+      // Round-trip the generated data through an Arrow IPC stream so it can be
+      // fed to the Dataset writer via an ArrowReader.
+      byte[] arrowStreamData = serializeFile(root);
+      try (SeekableByteChannel channel = new ByteArrayReadableSeekableByteChannel(arrowStreamData)) {
+        try (ArrowStreamReader reader = new ArrowStreamReader(channel, rootAllocator())) {
+          TMP.create();
+          final File writtenFolder = TMP.newFolder();
+          final String writtenParquet = writtenFolder.toURI().toString();
+          DatasetFileWriter.write(rootAllocator(), reader, FileFormat.PARQUET,
+              writtenParquet);
+
+          // Resolve the reference file from the arrow-testing data directory and
+          // compare it against the file that was just written.
+          String referenceFile = ArrowTestDataUtil.getTestDataRoot()
+              .resolve("parquet")
+              .resolve("alltypes-java.parquet")
+              .toUri().toString();
+          assertParquetFileEquals(referenceFile,
+              Objects.requireNonNull(writtenFolder.listFiles())[0].toURI().toString());
+        }
+      }
+    }
+  }
+}
diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/TestDataset.java b/java/dataset/src/test/java/org/apache/arrow/dataset/TestDataset.java
index af2abeee2145f..cafa63b7880ce 100644
--- a/java/dataset/src/test/java/org/apache/arrow/dataset/TestDataset.java
+++ b/java/dataset/src/test/java/org/apache/arrow/dataset/TestDataset.java
@@ -27,6 +27,9 @@
 import java.util.stream.Stream;
 import java.util.stream.StreamSupport;
 
+import org.apache.arrow.dataset.file.FileFormat;
+import org.apache.arrow.dataset.file.FileSystemDatasetFactory;
+import org.apache.arrow.dataset.jni.NativeMemoryPool;
 import org.apache.arrow.dataset.scanner.ScanOptions;
 import org.apache.arrow.dataset.scanner.Scanner;
 import org.apache.arrow.dataset.source.Dataset;
@@ -34,12 +37,16 @@
 import org.apache.arrow.memory.BufferAllocator;
 import org.apache.arrow.memory.RootAllocator;
 import org.apache.arrow.util.AutoCloseables;
+import org.apache.arrow.vector.FieldVector;
+import org.apache.arrow.vector.VectorLoader;
 import org.apache.arrow.vector.VectorSchemaRoot;
 import org.apache.arrow.vector.VectorUnloader;
+import org.apache.arrow.vector.compare.VectorEqualsVisitor;
 import org.apache.arrow.vector.ipc.ArrowReader;
 import org.apache.arrow.vector.ipc.message.ArrowRecordBatch;
 import org.apache.arrow.vector.types.pojo.Schema;
 import org.junit.After;
+import org.junit.Assert;
 import org.junit.Before;
@@ -100,6 +107,40 @@ protected Schema inferResultSchemaFromFactory(DatasetFactory factory, ScanOption
     return schema;
   }
 
+  protected void assertParquetFileEquals(String expectedURI, String actualURI) throws Exception {
+    final FileSystemDatasetFactory expectedFactory = new FileSystemDatasetFactory(
+        rootAllocator(), NativeMemoryPool.getDefault(), FileFormat.PARQUET, expectedURI);
+    final FileSystemDatasetFactory actualFactory = new FileSystemDatasetFactory(
+        rootAllocator(), NativeMemoryPool.getDefault(), FileFormat.PARQUET, actualURI);
+    List<ArrowRecordBatch> expectedBatches = collectResultFromFactory(expectedFactory,
+        new ScanOptions(new String[0], 100));
+    List<ArrowRecordBatch> actualBatches = collectResultFromFactory(actualFactory,
+        new ScanOptions(new String[0], 100));
+    try (
+        VectorSchemaRoot expectVsr = VectorSchemaRoot.create(expectedFactory.inspect(), rootAllocator());
+        VectorSchemaRoot actualVsr = VectorSchemaRoot.create(actualFactory.inspect(), rootAllocator())) {
+
+      // fast-fail by comparing metadata
+      Assert.assertEquals(expectedBatches.toString(), actualBatches.toString());
+      // compare ArrowRecordBatches
+      Assert.assertEquals(expectedBatches.size(), actualBatches.size());
+      VectorLoader expectLoader = new VectorLoader(expectVsr);
+      VectorLoader actualLoader = new VectorLoader(actualVsr);
+      for (int i = 0; i < expectedBatches.size(); i++) {
+        expectLoader.load(expectedBatches.get(i));
+        actualLoader.load(actualBatches.get(i));
+        for (int j = 0; j < expectVsr.getFieldVectors().size(); j++) {
+          FieldVector vector = expectVsr.getFieldVectors().get(j);
+          FieldVector otherVector = actualVsr.getFieldVectors().get(j);
+          // TODO: ARROW-18140 Use VectorSchemaRoot#equals() method to compare
+          Assert.assertTrue(VectorEqualsVisitor.vectorEquals(vector, otherVector));
+        }
+      }
+    } finally {
+      AutoCloseables.close(expectedBatches, actualBatches);
+    }
+  }
+
   protected <T> Stream<T> stream(Iterable<T> iterable) {
     return StreamSupport.stream(iterable.spliterator(), false);
   }
diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestDatasetFileWriter.java b/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestDatasetFileWriter.java
index 10c06be2cca3b..86797c165b2b1 100644
--- a/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestDatasetFileWriter.java
+++ b/java/dataset/src/test/java/org/apache/arrow/dataset/file/TestDatasetFileWriter.java
@@ -20,7 +20,6 @@
 import java.io.File;
 import java.util.Arrays;
 import java.util.HashSet;
-import java.util.List;
 import java.util.Objects;
 import java.util.Set;
 import java.util.stream.Collectors;
@@ -32,12 +31,6 @@
 import org.apache.arrow.dataset.scanner.ScanOptions;
 import org.apache.arrow.dataset.scanner.Scanner;
 import org.apache.arrow.dataset.source.Dataset;
-import org.apache.arrow.util.AutoCloseables;
-import org.apache.arrow.vector.FieldVector;
-import org.apache.arrow.vector.VectorLoader;
-import org.apache.arrow.vector.VectorSchemaRoot;
-import org.apache.arrow.vector.compare.VectorEqualsVisitor;
-import org.apache.arrow.vector.ipc.message.ArrowRecordBatch;
 import org.apache.commons.io.FileUtils;
 import org.junit.Assert;
 import org.junit.ClassRule;
@@ -99,39 +92,5 @@ public void testParquetWriteWithPartitions() throws Exception {
       Assert.assertEquals(expectedOutputFiles, outputFiles);
     }
   }
-
-  private void assertParquetFileEquals(String expectedURI, String actualURI) throws Exception {
-    final FileSystemDatasetFactory expectedFactory = new FileSystemDatasetFactory(
-        rootAllocator(), NativeMemoryPool.getDefault(), FileFormat.PARQUET, expectedURI);
-    final FileSystemDatasetFactory actualFactory = new FileSystemDatasetFactory(
-        rootAllocator(), NativeMemoryPool.getDefault(), FileFormat.PARQUET, actualURI);
-    List<ArrowRecordBatch> expectedBatches = collectResultFromFactory(expectedFactory,
-        new ScanOptions(new String[0], 100));
-    List<ArrowRecordBatch> actualBatches = collectResultFromFactory(actualFactory,
-        new ScanOptions(new String[0], 100));
-    try (
-        VectorSchemaRoot expectVsr = VectorSchemaRoot.create(expectedFactory.inspect(), rootAllocator());
-        VectorSchemaRoot actualVsr = VectorSchemaRoot.create(actualFactory.inspect(), rootAllocator())) {
-
-      // fast-fail by comparing metadata
-      Assert.assertEquals(expectedBatches.toString(), actualBatches.toString());
-      // compare ArrowRecordBatches
-      Assert.assertEquals(expectedBatches.size(), actualBatches.size());
-      VectorLoader expectLoader = new VectorLoader(expectVsr);
-      VectorLoader actualLoader = new
VectorLoader(actualVsr); - for (int i = 0; i < expectedBatches.size(); i++) { - expectLoader.load(expectedBatches.get(i)); - actualLoader.load(actualBatches.get(i)); - for (int j = 0; j < expectVsr.getFieldVectors().size(); j++) { - FieldVector vector = expectVsr.getFieldVectors().get(i); - FieldVector otherVector = actualVsr.getFieldVectors().get(i); - // TODO: ARROW-18140 Use VectorSchemaRoot#equals() method to compare - Assert.assertTrue(VectorEqualsVisitor.vectorEquals(vector, otherVector)); - } - } - } finally { - AutoCloseables.close(expectedBatches, actualBatches); - } - } } diff --git a/java/flight/flight-core/src/test/java/org/apache/arrow/flight/FlightTestUtil.java b/java/flight/flight-core/src/test/java/org/apache/arrow/flight/FlightTestUtil.java index cbb714a967bdd..25d59d99ad7cd 100644 --- a/java/flight/flight-core/src/test/java/org/apache/arrow/flight/FlightTestUtil.java +++ b/java/flight/flight-core/src/test/java/org/apache/arrow/flight/FlightTestUtil.java @@ -20,12 +20,11 @@ import java.io.File; import java.lang.reflect.InvocationTargetException; import java.nio.file.Path; -import java.nio.file.Paths; import java.util.Arrays; import java.util.List; -import java.util.Objects; import java.util.Random; +import org.apache.arrow.util.ArrowTestDataUtil; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.function.Executable; @@ -37,21 +36,9 @@ public class FlightTestUtil { private static final Random RANDOM = new Random(); public static final String LOCALHOST = "localhost"; - public static final String TEST_DATA_ENV_VAR = "ARROW_TEST_DATA"; - public static final String TEST_DATA_PROPERTY = "arrow.test.dataRoot"; - - static Path getTestDataRoot() { - String path = System.getenv(TEST_DATA_ENV_VAR); - if (path == null) { - path = System.getProperty(TEST_DATA_PROPERTY); - } - return Paths.get(Objects.requireNonNull(path, - String.format("Could not find test data path. Set the environment variable %s or the JVM property %s.", - TEST_DATA_ENV_VAR, TEST_DATA_PROPERTY))); - } static Path getFlightTestDataRoot() { - return getTestDataRoot().resolve("flight"); + return ArrowTestDataUtil.getTestDataRoot().resolve("flight"); } static Path exampleTlsRootCert() { diff --git a/java/vector/src/test/java/org/apache/arrow/util/ArrowTestDataUtil.java b/java/vector/src/test/java/org/apache/arrow/util/ArrowTestDataUtil.java new file mode 100644 index 0000000000000..120c0adc884ed --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/util/ArrowTestDataUtil.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.util; + +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Objects; + +/** + * Utility methods and constants for working with the arrow-testing repo. 
+ */ +public final class ArrowTestDataUtil { + public static final String TEST_DATA_ENV_VAR = "ARROW_TEST_DATA"; + public static final String TEST_DATA_PROPERTY = "arrow.test.dataRoot"; + + public static Path getTestDataRoot() { + String path = System.getenv(TEST_DATA_ENV_VAR); + if (path == null) { + path = System.getProperty(TEST_DATA_PROPERTY); + } + return Paths.get(Objects.requireNonNull(path, + String.format("Could not find test data path. Set the environment variable %s or the JVM property %s.", + TEST_DATA_ENV_VAR, TEST_DATA_PROPERTY))); + } + + private ArrowTestDataUtil() { + } +} diff --git a/testing b/testing index 47f7b56b25683..ad82a736c170e 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit 47f7b56b25683202c1fd957668e13f2abafc0f12 +Subproject commit ad82a736c170e97b7c8c035ebd8a801c17eec170
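
Reviewer note: TestAllTypes resolves the reference file through the new
ArrowTestDataUtil helper rather than through bundled test resources, so the
arrow-testing submodule pinned above has to be visible to the JVM, either via
the ARROW_TEST_DATA environment variable or the arrow.test.dataRoot system
property. A minimal stand-alone sketch of the lookup the test performs is
below; the ReferenceFileLookup class and its main method are illustrative
only and are not part of this patch:

    import java.nio.file.Path;

    import org.apache.arrow.util.ArrowTestDataUtil;

    // Hypothetical driver showing how the helper added by this patch
    // resolves the reference Parquet file.
    public final class ReferenceFileLookup {
      public static void main(String[] args) {
        // getTestDataRoot() fails fast (a NullPointerException with a
        // descriptive message) when neither ARROW_TEST_DATA nor
        // -Darrow.test.dataRoot is provided.
        Path reference = ArrowTestDataUtil.getTestDataRoot()
            .resolve("parquet")
            .resolve("alltypes-java.parquet");
        System.out.println("Reference file: " + reference.toUri());
      }
    }

Assuming a checkout where the submodule lives at testing/, pointing
ARROW_TEST_DATA at testing/data before running the dataset module's tests
should let testAllTypesParquet find the reference file.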