From c5173186bbd61641ba05e2848ad860fb696f6095 Mon Sep 17 00:00:00 2001 From: George Godik Date: Mon, 1 Aug 2022 16:42:36 -0400 Subject: [PATCH 01/11] ARROW-17274: [GO] Remove panic from parquet.file.RowGroupReader.Column(index int) (#13767) Remove panic from `parquet.file.RowGroupReader.Column(index int)` - parquet.file.RowGroupReader.Column(index int) panics if the provided column index is invalid. - Return an error instead, as the rest of the functions in the codebase do; for example, GetColumnPageReader returns (PageReader, error). - Fixed usage in tests. https://github.com/apache/arrow/blob/master/go/parquet/file/row_group_reader.go#L58 Authored-by: ggodik Signed-off-by: Matt Topol --- go/parquet/cmd/parquet_reader/main.go | 24 ++++++++++++---- go/parquet/encryption_read_config_test.go | 35 ++++++++++++++++++----- go/parquet/file/file_writer_test.go | 11 +++++-- go/parquet/file/row_group_reader.go | 8 +++--- go/parquet/pqarrow/encode_arrow_test.go | 12 +++++--- 5 files changed, 66 insertions(+), 24 deletions(-) diff --git a/go/parquet/cmd/parquet_reader/main.go b/go/parquet/cmd/parquet_reader/main.go index d86177a5e67f5..629db7fb0dfdc 100644 --- a/go/parquet/cmd/parquet_reader/main.go +++ b/go/parquet/cmd/parquet_reader/main.go @@ -230,8 +230,12 @@ func main() { scanners := make([]*Dumper, len(selectedColumns)) fields := make([]string, len(selectedColumns)) for idx, c := range selectedColumns { - scanners[idx] = createDumper(rgr.Column(c)) - fields[idx] = rgr.Column(c).Descriptor().Path() + col, err := rgr.Column(c) + if err != nil { + log.Fatalf("unable to fetch column=%d err=%s", c, err) + } + scanners[idx] = createDumper(col) + fields[idx] = col.Descriptor().Path() } var line string @@ -283,8 +287,12 @@ func main() { if idx > 0 { fmt.Fprint(dataOut, ",") } - scanners[idx] = createDumper(rgr.Column(c)) - fmt.Fprintf(dataOut, "%q", rgr.Column(c).Descriptor().Path()) + col, err := rgr.Column(c) + if err != nil { + log.Fatalf("unable to fetch col=%d err=%s", c, err) + } + scanners[idx] = createDumper(col) + fmt.Fprintf(dataOut, "%q", col.Descriptor().Path()) } fmt.Fprintln(dataOut) @@ -334,8 +342,12 @@ func main() { scanners := make([]*Dumper, len(selectedColumns)) for idx, c := range selectedColumns { - scanners[idx] = createDumper(rgr.Column(c)) - fmt.Fprintf(dataOut, fmt.Sprintf("%%-%ds|", colwidth), rgr.Column(c).Descriptor().Name()) + col, err := rgr.Column(c) + if err != nil { + log.Fatalf("unable to fetch column=%d err=%s", c, err) + } + scanners[idx] = createDumper(col) + fmt.Fprintf(dataOut, fmt.Sprintf("%%-%ds|", colwidth), col.Descriptor().Name()) } fmt.Fprintln(dataOut) diff --git a/go/parquet/encryption_read_config_test.go b/go/parquet/encryption_read_config_test.go index f3427d15cd22b..d16a5b3b8914b 100644 --- a/go/parquet/encryption_read_config_test.go +++ b/go/parquet/encryption_read_config_test.go @@ -185,7 +185,10 @@ func (d *TestDecryptionSuite) decryptFile(filename string, decryptConfigNum int) rowsRead := int64(0) // get col reader for boolean column - colReader := rowGroupReader.Column(0) + colReader, err := rowGroupReader.Column(0) + if err != nil { + panic(err) + } boolReader := colReader.(*file.BooleanColumnChunkReader) // get column chunk metadata for boolean column @@ -210,7 +213,10 @@ func (d *TestDecryptionSuite) decryptFile(filename string, decryptConfigNum int) d.EqualValues(i, boolMd.NumValues()) // Get column reader for int32 column - colReader = rowGroupReader.Column(1) + colReader, err = rowGroupReader.Column(1) + if err != nil { +
panic(err) + } int32reader := colReader.(*file.Int32ColumnChunkReader) int32md, _ := rgMeta.ColumnChunk(1) @@ -232,7 +238,10 @@ func (d *TestDecryptionSuite) decryptFile(filename string, decryptConfigNum int) d.EqualValues(i, int32md.NumValues()) // Get column reader for int64 column - colReader = rowGroupReader.Column(2) + colReader, err = rowGroupReader.Column(2) + if err != nil { + panic(err) + } int64reader := colReader.(*file.Int64ColumnChunkReader) int64md, _ := rgMeta.ColumnChunk(2) @@ -265,7 +274,10 @@ func (d *TestDecryptionSuite) decryptFile(filename string, decryptConfigNum int) d.EqualValues(i, int64md.NumValues()) // Get column reader for int96 column - colReader = rowGroupReader.Column(3) + colReader, err = rowGroupReader.Column(3) + if err != nil { + panic(err) + } int96reader := colReader.(*file.Int96ColumnChunkReader) int96md, _ := rgMeta.ColumnChunk(3) @@ -297,7 +309,10 @@ func (d *TestDecryptionSuite) decryptFile(filename string, decryptConfigNum int) // try to read them during the plaintext test. if props.FileDecryptProps != nil { // Get column reader for the float column - colReader = rowGroupReader.Column(4) + colReader, err = rowGroupReader.Column(4) + if err != nil { + panic(err) + } floatReader := colReader.(*file.Float32ColumnChunkReader) floatmd, _ := rgMeta.ColumnChunk(4) @@ -320,7 +335,10 @@ func (d *TestDecryptionSuite) decryptFile(filename string, decryptConfigNum int) d.EqualValues(i, floatmd.NumValues()) // Get column reader for the double column - colReader = rowGroupReader.Column(5) + colReader, err = rowGroupReader.Column(5) + if err != nil { + panic(err) + } dblReader := colReader.(*file.Float64ColumnChunkReader) dblmd, _ := rgMeta.ColumnChunk(5) @@ -343,7 +361,10 @@ func (d *TestDecryptionSuite) decryptFile(filename string, decryptConfigNum int) d.EqualValues(i, dblmd.NumValues()) } - colReader = rowGroupReader.Column(6) + colReader, err = rowGroupReader.Column(6) + if err != nil { + panic(err) + } bareader := colReader.(*file.ByteArrayColumnChunkReader) bamd, _ := rgMeta.ColumnChunk(6) diff --git a/go/parquet/file/file_writer_test.go b/go/parquet/file/file_writer_test.go index e7d9deca6e141..fd43779ce812b 100644 --- a/go/parquet/file/file_writer_test.go +++ b/go/parquet/file/file_writer_test.go @@ -115,7 +115,8 @@ func (t *SerializeTestSuite) fileSerializeTest(codec compress.Compression, expec t.False(chunk.HasIndexPage()) t.DefLevelsOut = make([]int16, t.rowsPerRG) t.RepLevelsOut = make([]int16, t.rowsPerRG) - colReader := rgr.Column(i) + colReader, err := rgr.Column(i) + t.NoError(err) t.SetupValuesOut(int64(t.rowsPerRG)) valuesRead = t.ReadBatch(colReader, int64(t.rowsPerRG), 0, t.DefLevelsOut, t.RepLevelsOut) t.EqualValues(t.rowsPerRG, valuesRead) @@ -310,7 +311,9 @@ func TestBufferedMultiPageDisabledDictionary(t *testing.T) { assert.EqualValues(t, valueCount, rgr.NumRows()) var totalRead int64 - colReader := rgr.Column(0).(*file.Int32ColumnChunkReader) + col, err := rgr.Column(0) + assert.NoError(t, err) + colReader := col.(*file.Int32ColumnChunkReader) for colReader.HasNext() { total, _, _ := colReader.ReadBatch(valueCount-totalRead, valuesOut[totalRead:], nil, nil) totalRead += total @@ -350,7 +353,9 @@ func TestAllNulls(t *testing.T) { assert.NoError(t, err) rgr := reader.RowGroup(0) - cr := rgr.Column(0).(*file.Int32ColumnChunkReader) + col, err := rgr.Column(0) + assert.NoError(t, err) + cr := col.(*file.Int32ColumnChunkReader) defLevels[0] = -1 defLevels[1] = -1 diff --git a/go/parquet/file/row_group_reader.go 
b/go/parquet/file/row_group_reader.go index 5d383dd28c8f5..87ed4a01834e0 100644 --- a/go/parquet/file/row_group_reader.go +++ b/go/parquet/file/row_group_reader.go @@ -55,17 +55,17 @@ func (r *RowGroupReader) ByteSize() int64 { return r.rgMetadata.TotalByteSize() // Column returns a column reader for the requested (0-indexed) column // // panics if passed a column not in the range [0, NumColumns) -func (r *RowGroupReader) Column(i int) ColumnChunkReader { +func (r *RowGroupReader) Column(i int) (ColumnChunkReader, error) { if i >= r.NumColumns() || i < 0 { - panic(fmt.Errorf("parquet: trying to read column index %d but row group metadata only has %d columns", i, r.rgMetadata.NumColumns())) + return nil, fmt.Errorf("parquet: trying to read column index %d but row group metadata only has %d columns", i, r.rgMetadata.NumColumns()) } descr := r.fileMetadata.Schema.Column(i) pageRdr, err := r.GetColumnPageReader(i) if err != nil { - panic(fmt.Errorf("parquet: unable to initialize page reader: %w", err)) + return nil, fmt.Errorf("parquet: unable to initialize page reader: %w", err) } - return NewColumnReader(descr, pageRdr, r.props.Allocator()) + return NewColumnReader(descr, pageRdr, r.props.Allocator()), nil } func (r *RowGroupReader) GetColumnPageReader(i int) (PageReader, error) { diff --git a/go/parquet/pqarrow/encode_arrow_test.go b/go/parquet/pqarrow/encode_arrow_test.go index 7f799e2338749..9263b9aae69ba 100644 --- a/go/parquet/pqarrow/encode_arrow_test.go +++ b/go/parquet/pqarrow/encode_arrow_test.go @@ -165,13 +165,14 @@ func TestWriteArrowCols(t *testing.T) { var ( total int64 read int - err error defLevelsOut = make([]int16, int(expected.NumRows())) arr = expected.Column(i).Data().Chunk(0) ) switch expected.Schema().Field(i).Type.(arrow.FixedWidthDataType).BitWidth() { case 32: - colReader := rgr.Column(i).(*file.Int32ColumnChunkReader) + col, err := rgr.Column(i) + assert.NoError(t, err) + colReader := col.(*file.Int32ColumnChunkReader) vals := make([]int32, int(expected.NumRows())) total, read, err = colReader.ReadBatch(expected.NumRows(), vals, defLevelsOut, nil) require.NoError(t, err) @@ -191,7 +192,9 @@ func TestWriteArrowCols(t *testing.T) { } } case 64: - colReader := rgr.Column(i).(*file.Int64ColumnChunkReader) + col, err := rgr.Column(i) + assert.NoError(t, err) + colReader := col.(*file.Int64ColumnChunkReader) vals := make([]int64, int(expected.NumRows())) total, read, err = colReader.ReadBatch(expected.NumRows(), vals, defLevelsOut, nil) require.NoError(t, err) @@ -258,7 +261,8 @@ func TestWriteArrowInt96(t *testing.T) { assert.EqualValues(t, 1, reader.NumRowGroups()) rgr := reader.RowGroup(0) - tsRdr := rgr.Column(3) + tsRdr, err := rgr.Column(3) + assert.NoError(t, err) assert.Equal(t, parquet.Types.Int96, tsRdr.Type()) rdr := tsRdr.(*file.Int96ColumnChunkReader) From e80981ca2a7b8c43f61f41b55842423b8295d633 Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 1 Aug 2022 17:30:42 -0400 Subject: [PATCH 02/11] ARROW-17272: [Dev] Pass --add-opens in integration tests (#13765) This affects Java 16+ due to JEP 396 which now forces us to pass `--add-opens`. Detect the presence of Java >8 and pass the flag in the integration runner. 
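For reference, a minimal standalone sketch of that probe (assumes a `java` binary on `PATH`; the option string and stderr check mirror the change below):

```python
import subprocess

# Java 8 rejects --add-opens with "Unrecognized option", while Java 9+
# fails differently (a missing option value), so stderr tells them apart.
proc = subprocess.run(
    ["java", "--add-opens"],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)

java_opts = []
if "Unrecognized option: --add-opens" not in proc.stderr:
    # Java 9+: open java.base/java.nio to unnamed modules, since JEP 396
    # made JDK internals strongly encapsulated by default.
    java_opts.append("--add-opens=java.base/java.nio=ALL-UNNAMED")
```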
Authored-by: David Li Signed-off-by: Sutou Kouhei --- .../archery/integration/tester_java.py | 33 ++++++++++++++----- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/dev/archery/archery/integration/tester_java.py b/dev/archery/archery/integration/tester_java.py index dc4550d997eb5..4c85a3a30b18d 100644 --- a/dev/archery/archery/integration/tester_java.py +++ b/dev/archery/archery/integration/tester_java.py @@ -55,10 +55,10 @@ def load_version_from_pom(): ), ) _ARROW_FLIGHT_SERVER = ( - "org.apache.arrow.flight.integration.tests." "IntegrationTestServer" + "org.apache.arrow.flight.integration.tests.IntegrationTestServer" ) _ARROW_FLIGHT_CLIENT = ( - "org.apache.arrow.flight.integration.tests." "IntegrationTestClient" + "org.apache.arrow.flight.integration.tests.IntegrationTestClient" ) @@ -70,10 +70,24 @@ class JavaTester(Tester): name = 'Java' + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # Detect whether we're on Java 8 or Java 9+ + self._java_opts = _JAVA_OPTS[:] + proc = subprocess.run( + ['java', '--add-opens'], + stderr=subprocess.PIPE, + stdout=subprocess.PIPE, + text=True) + if 'Unrecognized option: --add-opens' not in proc.stderr: + # Java 9+ + self._java_opts.append( + '--add-opens=java.base/java.nio=ALL-UNNAMED') + def _run(self, arrow_path=None, json_path=None, command='VALIDATE'): cmd = ( ['java'] + - _JAVA_OPTS + + self._java_opts + ['-cp', _ARROW_TOOLS_JAR, 'org.apache.arrow.tools.Integration'] ) @@ -98,7 +112,7 @@ def json_to_file(self, json_path, arrow_path): def stream_to_file(self, stream_path, file_path): cmd = ( - ['java'] + _JAVA_OPTS + [ + ['java'] + self._java_opts + [ '-cp', _ARROW_TOOLS_JAR, 'org.apache.arrow.tools.StreamToFile', @@ -112,7 +126,7 @@ def stream_to_file(self, stream_path, file_path): def file_to_stream(self, file_path, stream_path): cmd = ( - ['java'] + _JAVA_OPTS + [ + ['java'] + self._java_opts + [ '-cp', _ARROW_TOOLS_JAR, 'org.apache.arrow.tools.FileToStream', @@ -126,9 +140,10 @@ def file_to_stream(self, file_path, stream_path): def flight_request(self, port, json_path=None, scenario_name=None): cmd = ( - ['java'] + _JAVA_OPTS + ['-cp', _ARROW_FLIGHT_JAR, - _ARROW_FLIGHT_CLIENT, '-port', str(port)] - ) + ['java'] + self._java_opts + [ + '-cp', _ARROW_FLIGHT_JAR, _ARROW_FLIGHT_CLIENT, '-port', str( + port) + ]) if json_path: cmd.extend(('-j', json_path)) @@ -145,7 +160,7 @@ def flight_request(self, port, json_path=None, scenario_name=None): def flight_server(self, scenario_name=None): cmd = ( ['java'] + - _JAVA_OPTS + + self._java_opts + ['-cp', _ARROW_FLIGHT_JAR, _ARROW_FLIGHT_SERVER, '-port', '0'] ) if scenario_name: From 48e27804296233a9ab90b6096e291046ff2db6f3 Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 1 Aug 2022 20:42:58 -0400 Subject: [PATCH 03/11] ARROW-17270: [Docs] Move nightly package instructions to dev docs (#13766) ASF policies state we shouldn't direct people to unofficial/developer/nightly releases. Move the instructions to the developer documentation to make this clearer. 
Authored-by: David Li Signed-off-by: Sutou Kouhei --- docs/source/developers/java/building.rst | 157 +++++++++++++++++++++ docs/source/developers/python.rst | 29 ++++- docs/source/java/install.rst | 155 ---------------------- docs/source/python/install.rst | 27 ---- 4 files changed, 185 insertions(+), 183 deletions(-) diff --git a/docs/source/developers/java/building.rst b/docs/source/developers/java/building.rst index 2824649253f11..add2b11b27807 100644 --- a/docs/source/developers/java/building.rst +++ b/docs/source/developers/java/building.rst @@ -220,3 +220,160 @@ Common Errors .. _Archery: https://github.com/apache/arrow/blob/master/dev/archery/README.md .. _Dependency Resolution: https://arrow.apache.org/docs/developers/cpp/building.html#individual-dependency-resolution .. _C++ shared libraries: https://arrow.apache.org/docs/cpp/build_system.html + + +Installing Nightly Packages +=========================== + +.. warning:: + These packages are not official releases. Use them at your own risk. + +Arrow nightly builds are posted on the mailing list at `builds@arrow.apache.org`_. +The artifacts are uploaded to GitHub. For example, for 2022/07/30, they can be found at `Github Nightly`_. + + +Installing from Apache Nightlies +-------------------------------- +1. Look up the nightly version number for the Arrow libraries used. + + For example, for ``arrow-memory``, visit https://nightlies.apache.org/arrow/java/org/apache/arrow/arrow-memory/ and see what versions are available (e.g. 9.0.0.dev501). +2. Add the Apache Nightlies repository to the Maven/Gradle project. + +.. code-block:: xml + + <properties> + <arrow.version>9.0.0.dev501</arrow.version> + </properties> + ... + <repositories> + <repository> + <id>arrow-apache-nightlies</id> + <url>https://nightlies.apache.org/arrow/java</url> + </repository> + </repositories> + ... + <dependencies> + <dependency> + <groupId>org.apache.arrow</groupId> + <artifactId>arrow-vector</artifactId> + <version>${arrow.version}</version> + </dependency> + </dependencies> + ... + +Installing Manually +------------------- + +1. Decide which nightly packages repository to use, for example: https://github.com/ursacomputing/crossbow/releases/tag/nightly-packaging-2022-07-30-0-github-java-jars +2. Add packages to your pom.xml, for example: flight-core (it depends on: arrow-format, arrow-vector, arrow-memory-core and arrow-memory-netty). + +.. code-block:: xml + + <properties> + <maven.compiler.source>8</maven.compiler.source> + <maven.compiler.target>8</maven.compiler.target> + <arrow.version>9.0.0.dev501</arrow.version> + </properties> + + + <dependencies> + <dependency> + <groupId>org.apache.arrow</groupId> + <artifactId>flight-core</artifactId> + <version>${arrow.version}</version> + </dependency> + </dependencies> + +3. Download the necessary pom and jar files to a temporary directory: + +..
code-block:: shell + + $ mkdir nightly-packaging-2022-07-30-0-github-java-jars + $ cd nightly-packaging-2022-07-30-0-github-java-jars + $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-java-root-9.0.0.dev501.pom + $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-format-9.0.0.dev501.pom + $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-format-9.0.0.dev501.jar + $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-vector-9.0.0.dev501.pom + $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-vector-9.0.0.dev501.jar + $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-memory-9.0.0.dev501.pom + $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-memory-core-9.0.0.dev501.pom + $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-memory-netty-9.0.0.dev501.pom + $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-memory-core-9.0.0.dev501.jar + $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-memory-netty-9.0.0.dev501.jar + $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-flight-9.0.0.dev501.pom + $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/flight-core-9.0.0.dev501.pom + $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/flight-core-9.0.0.dev501.jar + $ tree + . + ├── arrow-flight-9.0.0.dev501.pom + ├── arrow-format-9.0.0.dev501.jar + ├── arrow-format-9.0.0.dev501.pom + ├── arrow-java-root-9.0.0.dev501.pom + ├── arrow-memory-9.0.0.dev501.pom + ├── arrow-memory-core-9.0.0.dev501.jar + ├── arrow-memory-core-9.0.0.dev501.pom + ├── arrow-memory-netty-9.0.0.dev501.jar + ├── arrow-memory-netty-9.0.0.dev501.pom + ├── arrow-vector-9.0.0.dev501.jar + ├── arrow-vector-9.0.0.dev501.pom + ├── flight-core-9.0.0.dev501.jar + └── flight-core-9.0.0.dev501.pom + +4. Install the artifacts to the local Maven repository with ``mvn install:install-file``: + +.. 
code-block:: shell + + $ mvn install:install-file -Dfile="$(pwd)/arrow-java-root-9.0.0.dev501.pom" -DgroupId=org.apache.arrow -DartifactId=arrow-java-root -Dversion=9.0.0.dev501 -Dpackaging=pom + $ mvn install:install-file -Dfile="$(pwd)/arrow-format-9.0.0.dev501.pom" -DgroupId=org.apache.arrow -DartifactId=arrow-format -Dversion=9.0.0.dev501 -Dpackaging=pom + $ mvn install:install-file -Dfile="$(pwd)/arrow-format-9.0.0.dev501.jar" -DgroupId=org.apache.arrow -DartifactId=arrow-format -Dversion=9.0.0.dev501 -Dpackaging=jar + $ mvn install:install-file -Dfile="$(pwd)/arrow-vector-9.0.0.dev501.pom" -DgroupId=org.apache.arrow -DartifactId=arrow-vector -Dversion=9.0.0.dev501 -Dpackaging=pom + $ mvn install:install-file -Dfile="$(pwd)/arrow-vector-9.0.0.dev501.jar" -DgroupId=org.apache.arrow -DartifactId=arrow-vector -Dversion=9.0.0.dev501 -Dpackaging=jar + $ mvn install:install-file -Dfile="$(pwd)/arrow-memory-9.0.0.dev501.pom" -DgroupId=org.apache.arrow -DartifactId=arrow-memory -Dversion=9.0.0.dev501 -Dpackaging=pom + $ mvn install:install-file -Dfile="$(pwd)/arrow-memory-core-9.0.0.dev501.pom" -DgroupId=org.apache.arrow -DartifactId=arrow-memory-core -Dversion=9.0.0.dev501 -Dpackaging=pom + $ mvn install:install-file -Dfile="$(pwd)/arrow-memory-netty-9.0.0.dev501.pom" -DgroupId=org.apache.arrow -DartifactId=arrow-memory-netty -Dversion=9.0.0.dev501 -Dpackaging=pom + $ mvn install:install-file -Dfile="$(pwd)/arrow-memory-core-9.0.0.dev501.jar" -DgroupId=org.apache.arrow -DartifactId=arrow-memory-core -Dversion=9.0.0.dev501 -Dpackaging=jar + $ mvn install:install-file -Dfile="$(pwd)/arrow-memory-netty-9.0.0.dev501.jar" -DgroupId=org.apache.arrow -DartifactId=arrow-memory-netty -Dversion=9.0.0.dev501 -Dpackaging=jar + $ mvn install:install-file -Dfile="$(pwd)/arrow-flight-9.0.0.dev501.pom" -DgroupId=org.apache.arrow -DartifactId=arrow-flight -Dversion=9.0.0.dev501 -Dpackaging=pom + $ mvn install:install-file -Dfile="$(pwd)/flight-core-9.0.0.dev501.pom" -DgroupId=org.apache.arrow -DartifactId=flight-core -Dversion=9.0.0.dev501 -Dpackaging=pom + $ mvn install:install-file -Dfile="$(pwd)/flight-core-9.0.0.dev501.jar" -DgroupId=org.apache.arrow -DartifactId=flight-core -Dversion=9.0.0.dev501 -Dpackaging=jar + +5. Validate that the packages were installed: + +.. code-block:: shell + + $ tree ~/.m2/repository/org/apache/arrow + . + ├── arrow-flight + │   ├── 9.0.0.dev501 + │   │   └── arrow-flight-9.0.0.dev501.pom + ├── arrow-format + │   ├── 9.0.0.dev501 + │   │   ├── arrow-format-9.0.0.dev501.jar + │   │   └── arrow-format-9.0.0.dev501.pom + ├── arrow-java-root + │   ├── 9.0.0.dev501 + │   │   └── arrow-java-root-9.0.0.dev501.pom + ├── arrow-memory + │   ├── 9.0.0.dev501 + │   │   └── arrow-memory-9.0.0.dev501.pom + ├── arrow-memory-core + │   ├── 9.0.0.dev501 + │   │   ├── arrow-memory-core-9.0.0.dev501.jar + │   │   └── arrow-memory-core-9.0.0.dev501.pom + ├── arrow-memory-netty + │   ├── 9.0.0.dev501 + │   │   ├── arrow-memory-netty-9.0.0.dev501.jar + │   │   └── arrow-memory-netty-9.0.0.dev501.pom + ├── arrow-vector + │   ├── 9.0.0.dev501 + │   │   ├── _remote.repositories + │   │   ├── arrow-vector-9.0.0.dev501.jar + │   │   └── arrow-vector-9.0.0.dev501.pom + └── flight-core + ├── 9.0.0.dev501 + │   ├── flight-core-9.0.0.dev501.jar + │   └── flight-core-9.0.0.dev501.pom + +6. Compile your project like usual with ``mvn clean install``. + +.. _builds@arrow.apache.org: https://lists.apache.org/list.html?builds@arrow.apache.org +.. 
_Github Nightly: https://github.com/ursacomputing/crossbow/releases/tag/nightly-packaging-2022-07-30-0-github-java-jars diff --git a/docs/source/developers/python.rst b/docs/source/developers/python.rst index 0cce2f83f5925..98ed93968a945 100644 --- a/docs/source/developers/python.rst +++ b/docs/source/developers/python.rst @@ -388,7 +388,7 @@ Similarly, if you built with ``PARQUET_REQUIRE_ENCRYPTION`` (in C++), you need to set the corresponding ``PYARROW_WITH_PARQUET_ENCRYPTION`` environment variable to 1. -To set the number of threads used to compile PyArrow's C++/Cython components, +To set the number of threads used to compile PyArrow's C++/Cython components, set the ``PYARROW_PARALLEL`` environment variable. If you wish to delete stale PyArrow build artifacts before rebuilding, navigate @@ -604,3 +604,30 @@ Caveats ------- The Plasma component is not supported on Windows. + +Installing Nightly Packages +=========================== + +.. warning:: + These packages are not official releases. Use them at your own risk. + +PyArrow has nightly wheels and Conda packages for testing purposes. + +These may be suitable for downstream libraries in their continuous integration +setup to maintain compatibility with the upcoming PyArrow features, +deprecations and/or feature removals. + +Install the development version of PyArrow from `arrow-nightlies +<https://anaconda.org/arrow-nightlies>`_ conda channel: + +.. code-block:: bash + + conda install -c arrow-nightlies pyarrow + +Install the development version from an `alternative PyPI +<https://pypi.fury.io/arrow-nightlies/>`_ index: + +.. code-block:: bash + + pip install --extra-index-url https://pypi.fury.io/arrow-nightlies/ \ + --prefer-binary --pre pyarrow diff --git a/docs/source/java/install.rst b/docs/source/java/install.rst index 206aaad98ade1..9eaf2b5883415 100644 --- a/docs/source/java/install.rst +++ b/docs/source/java/install.rst @@ -134,158 +134,3 @@ Installing from Source ---------------------- See :ref:`java-development`. - -Installing Nightly Packages --------------------------- - -.. warning:: - These packages are not official releases. Use them at your own risk. - -Arrow nightly builds are posted on the mailing list at `builds@arrow.apache.org`_. -The artifacts are uploaded to GitHub. For example, for 2022/03/01, they can be found at `Github Nightly`_. - -Installing from Apache Nightlies -******************************** -1. Look up the nightly version number for the Arrow libraries used. - - For example, for ``arrow-memory``, visit https://nightlies.apache.org/arrow/java/org/apache/arrow/arrow-memory/ and see what versions are available (e.g. 9.0.0.dev501). -2. Add Apache Nightlies Repository to the Maven/Gradle project. - -.. code-block:: xml - - <properties> - <arrow.version>9.0.0.dev501</arrow.version> - </properties> - ... - <repositories> - <repository> - <id>arrow-apache-nightlies</id> - <url>https://nightlies.apache.org/arrow/java</url> - </repository> - </repositories> - ... - <dependencies> - <dependency> - <groupId>org.apache.arrow</groupId> - <artifactId>arrow-vector</artifactId> - <version>${arrow.version}</version> - </dependency> - </dependencies> - ... - -Installing Manually -******************* - -1. Decide nightly packages repository to use, for example: https://github.com/ursacomputing/crossbow/releases/tag/nightly-packaging-2022-07-30-0-github-java-jars -2. Add packages to your pom.xml, for example: flight-core (it depends on: arrow-format, arrow-vector, arrow-memeory-core and arrow-memory-netty). - -.. code-block:: xml - - <properties> - <maven.compiler.source>8</maven.compiler.source> - <maven.compiler.target>8</maven.compiler.target> - <arrow.version>9.0.0.dev501</arrow.version> - </properties> - - - <dependencies> - <dependency> - <groupId>org.apache.arrow</groupId> - <artifactId>flight-core</artifactId> - <version>${arrow.version}</version> - </dependency> - </dependencies> - -3. Download the necessary pom and jar files to a temporary directory: - -..
code-block:: shell - - $ mkdir nightly-packaging-2022-07-30-0-github-java-jars - $ cd nightly-packaging-2022-07-30-0-github-java-jars - $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-java-root-9.0.0.dev501.pom - $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-format-9.0.0.dev501.pom - $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-format-9.0.0.dev501.jar - $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-vector-9.0.0.dev501.pom - $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-vector-9.0.0.dev501.jar - $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-memory-9.0.0.dev501.pom - $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-memory-core-9.0.0.dev501.pom - $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-memory-netty-9.0.0.dev501.pom - $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-memory-core-9.0.0.dev501.jar - $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-memory-netty-9.0.0.dev501.jar - $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-flight-9.0.0.dev501.pom - $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/flight-core-9.0.0.dev501.pom - $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/flight-core-9.0.0.dev501.jar - $ tree - . - ├── arrow-flight-9.0.0.dev501.pom - ├── arrow-format-9.0.0.dev501.jar - ├── arrow-format-9.0.0.dev501.pom - ├── arrow-java-root-9.0.0.dev501.pom - ├── arrow-memory-9.0.0.dev501.pom - ├── arrow-memory-core-9.0.0.dev501.jar - ├── arrow-memory-core-9.0.0.dev501.pom - ├── arrow-memory-netty-9.0.0.dev501.jar - ├── arrow-memory-netty-9.0.0.dev501.pom - ├── arrow-vector-9.0.0.dev501.jar - ├── arrow-vector-9.0.0.dev501.pom - ├── flight-core-9.0.0.dev501.jar - └── flight-core-9.0.0.dev501.pom - -4. Install the artifacts to the local Maven repository with ``mvn install:install-file``: - -.. 
code-block:: shell - - $ mvn install:install-file -Dfile="$(pwd)/arrow-java-root-9.0.0.dev501.pom" -DgroupId=org.apache.arrow -DartifactId=arrow-java-root -Dversion=9.0.0.dev501 -Dpackaging=pom - $ mvn install:install-file -Dfile="$(pwd)/arrow-format-9.0.0.dev501.pom" -DgroupId=org.apache.arrow -DartifactId=arrow-format -Dversion=9.0.0.dev501 -Dpackaging=pom - $ mvn install:install-file -Dfile="$(pwd)/arrow-format-9.0.0.dev501.jar" -DgroupId=org.apache.arrow -DartifactId=arrow-format -Dversion=9.0.0.dev501 -Dpackaging=jar - $ mvn install:install-file -Dfile="$(pwd)/arrow-vector-9.0.0.dev501.pom" -DgroupId=org.apache.arrow -DartifactId=arrow-vector -Dversion=9.0.0.dev501 -Dpackaging=pom - $ mvn install:install-file -Dfile="$(pwd)/arrow-vector-9.0.0.dev501.jar" -DgroupId=org.apache.arrow -DartifactId=arrow-vector -Dversion=9.0.0.dev501 -Dpackaging=jar - $ mvn install:install-file -Dfile="$(pwd)/arrow-memory-9.0.0.dev501.pom" -DgroupId=org.apache.arrow -DartifactId=arrow-memory -Dversion=9.0.0.dev501 -Dpackaging=pom - $ mvn install:install-file -Dfile="$(pwd)/arrow-memory-core-9.0.0.dev501.pom" -DgroupId=org.apache.arrow -DartifactId=arrow-memory-core -Dversion=9.0.0.dev501 -Dpackaging=pom - $ mvn install:install-file -Dfile="$(pwd)/arrow-memory-netty-9.0.0.dev501.pom" -DgroupId=org.apache.arrow -DartifactId=arrow-memory-netty -Dversion=9.0.0.dev501 -Dpackaging=pom - $ mvn install:install-file -Dfile="$(pwd)/arrow-memory-core-9.0.0.dev501.jar" -DgroupId=org.apache.arrow -DartifactId=arrow-memory-core -Dversion=9.0.0.dev501 -Dpackaging=jar - $ mvn install:install-file -Dfile="$(pwd)/arrow-memory-netty-9.0.0.dev501.jar" -DgroupId=org.apache.arrow -DartifactId=arrow-memory-netty -Dversion=9.0.0.dev501 -Dpackaging=jar - $ mvn install:install-file -Dfile="$(pwd)/arrow-flight-9.0.0.dev501.pom" -DgroupId=org.apache.arrow -DartifactId=arrow-flight -Dversion=9.0.0.dev501 -Dpackaging=pom - $ mvn install:install-file -Dfile="$(pwd)/flight-core-9.0.0.dev501.pom" -DgroupId=org.apache.arrow -DartifactId=flight-core -Dversion=9.0.0.dev501 -Dpackaging=pom - $ mvn install:install-file -Dfile="$(pwd)/flight-core-9.0.0.dev501.jar" -DgroupId=org.apache.arrow -DartifactId=flight-core -Dversion=9.0.0.dev501 -Dpackaging=jar - -5. Validate that the packages were installed: - -.. code-block:: shell - - $ tree ~/.m2/repository/org/apache/arrow - . - ├── arrow-flight - │   ├── 9.0.0.dev501 - │   │   └── arrow-flight-9.0.0.dev501.pom - ├── arrow-format - │   ├── 9.0.0.dev501 - │   │   ├── arrow-format-9.0.0.dev501.jar - │   │   └── arrow-format-9.0.0.dev501.pom - ├── arrow-java-root - │   ├── 9.0.0.dev501 - │   │   └── arrow-java-root-9.0.0.dev501.pom - ├── arrow-memory - │   ├── 9.0.0.dev501 - │   │   └── arrow-memory-9.0.0.dev501.pom - ├── arrow-memory-core - │   ├── 9.0.0.dev501 - │   │   ├── arrow-memory-core-9.0.0.dev501.jar - │   │   └── arrow-memory-core-9.0.0.dev501.pom - ├── arrow-memory-netty - │   ├── 9.0.0.dev501 - │   │   ├── arrow-memory-netty-9.0.0.dev501.jar - │   │   └── arrow-memory-netty-9.0.0.dev501.pom - ├── arrow-vector - │   ├── 9.0.0.dev501 - │   │   ├── _remote.repositories - │   │   ├── arrow-vector-9.0.0.dev501.jar - │   │   └── arrow-vector-9.0.0.dev501.pom - └── flight-core - ├── 9.0.0.dev501 - │   ├── flight-core-9.0.0.dev501.jar - │   └── flight-core-9.0.0.dev501.pom - -6. Compile your project like usual with ``mvn clean install``. - -.. _builds@arrow.apache.org: https://lists.apache.org/list.html?builds@arrow.apache.org -.. 
_Github Nightly: https://github.com/ursacomputing/crossbow/releases/tag/nightly-packaging-2022-07-30-0-github-java-jars \ No newline at end of file diff --git a/docs/source/python/install.rst b/docs/source/python/install.rst index d47a0970e934f..f884a9cc94b36 100644 --- a/docs/source/python/install.rst +++ b/docs/source/python/install.rst @@ -61,30 +61,3 @@ Installing from source ---------------------- See :ref:`python-development`. - -Installing Nightly Packages --------------------------- - -.. warning:: - These packages are not official releases. Use them at your own risk. - -PyArrow has nightly wheels and conda packages for testing purposes. - -These may be suitable for downstream libraries in their continuous integration -setup to maintain compatibility with the upcoming PyArrow features, -deprecations and/or feature removals. - -Install the development version of PyArrow from `arrow-nightlies -<https://anaconda.org/arrow-nightlies>`_ conda channel: - -.. code-block:: bash - - conda install -c arrow-nightlies pyarrow - -Install the development version from an `alternative PyPI -<https://pypi.fury.io/arrow-nightlies/>`_ index: - -.. code-block:: bash - - pip install --extra-index-url https://pypi.fury.io/arrow-nightlies/ \ - --prefer-binary --pre pyarrow From 51eb3c8adb5742f8d0d05c2e371dfbc651499614 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99=20Moldovan-Gr=C3=BCnfeld?= Date: Tue, 2 Aug 2022 13:49:15 +0100 Subject: [PATCH 04/11] ARROW-17084: [R] Install the package before linting (#13620) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The package should be installed before running `lintr::lint_package()` or, in our case, `lintr::expect_lint_free()`; otherwise we could encounter some false positives. See https://github.com/r-lib/lintr/issues/352#issuecomment-587004345 and https://github.com/r-lib/lintr/issues/406#issuecomment-534601141 Authored-by: Dragoș Moldovan-Grünfeld Signed-off-by: Jonathan Keane --- .github/workflows/r.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 4a9c605e3bce0..4f706e3e5b117 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -327,6 +327,14 @@ jobs: shell: Rscript {0} working-directory: r run: | + Sys.setenv( + RWINLIB_LOCAL = file.path(Sys.getenv("GITHUB_WORKSPACE"), "r", "windows", "libarrow.zip"), + MAKEFLAGS = paste0("-j", parallel::detectCores()), + ARROW_R_DEV = TRUE, + "_R_CHECK_FORCE_SUGGESTS_" = FALSE + ) + # we use pak for package installation since it is faster, safer and more convenient + pak::local_install() pak::pak("lintr") lintr::expect_lint_free() - name: Dump install logs From 2a027571d7a10526939c4c37bf84b2368e6f4b74 Mon Sep 17 00:00:00 2001 From: 0x26res Date: Tue, 2 Aug 2022 13:51:05 +0100 Subject: [PATCH 05/11] ARROW-17228: [Python] dataset.write_data should use Scanner.projected_schema when passed a scanner with projected columns (#13756) Issue: https://issues.apache.org/jira/browse/ARROW-17228 Authored-by: 0x26res <0x26res@gmail.com> Signed-off-by: David Li --- python/pyarrow/dataset.py | 2 +- python/pyarrow/tests/test_dataset.py | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py index 2518e37ec6f3d..326b37ec6e1ae 100644 --- a/python/pyarrow/dataset.py +++ b/python/pyarrow/dataset.py @@ -964,7 +964,7 @@ def file_visitor(written_file): # was converted to one of those two. So we can grab the schema # to build the partitioning object from Dataset.
if isinstance(data, Scanner): - partitioning_schema = data.dataset_schema + partitioning_schema = data.projected_schema else: partitioning_schema = data.schema partitioning = _ensure_write_partitioning(partitioning, diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index b900e694a91da..3dc9c3beb6ee1 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -4706,3 +4706,28 @@ def test_dataset_filter(tempdir): "colA": [1, 2], "col2": ["a", "b"] }) + + +def test_write_dataset_with_scanner_use_projected_schema(tempdir): + """ + Ensure the projected schema is used to validate partitions for scanner + + https://issues.apache.org/jira/browse/ARROW-17228 + """ + table = pa.table([pa.array(range(20))], names=["original_column"]) + table_dataset = ds.dataset(table) + columns = { + "renamed_column": ds.field("original_column"), + } + scanner = table_dataset.scanner(columns=columns) + + ds.write_dataset( + scanner, tempdir, partitioning=["renamed_column"], format="ipc") + with ( + pytest.raises( + KeyError, match=r"'Column original_column does not exist in schema" + ) + ): + ds.write_dataset( + scanner, tempdir, partitioning=["original_column"], format="ipc" + ) From 4bd3d2ecbb6f4f311d59098892b7dca5421c15e4 Mon Sep 17 00:00:00 2001 From: LouisClt Date: Tue, 2 Aug 2022 14:57:32 +0200 Subject: [PATCH 06/11] ARROW-17214: [C++] Add scalar casts to string types for list based types (#13737) Following https://lists.apache.org/thread/rp7vpjtt4lgtjxj35oyjyqh9b6on94jf discussion, here is the PR corresponding to the feature : be able to cast list-like types (maps, lists, fixed-size lists) to string type. It produces output such as : list{null, 1} map{{key:string = a, value:int64 = 2}, {key:string = b, value:int64 = 45}} map, int64>{{key:struct = {x:int16 = 884, y:bool = true}, value:int64 = 2}, {key:struct = {x:int16 = 874, y:bool = false}, value:int64 = null}} fixed_size_list[3]{4, 5, 6} I tried to be coherent with the rest of the CastTo methods. Feel free to comment. 
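For reference, a minimal sketch of the call side (example values are made up; assumes the usual Arrow C++ status/result macros and headers):

```cpp
#include <iostream>
#include <memory>
#include "arrow/api.h"

// Build a list<int16> scalar holding {1, 2, null} and cast it to a string.
arrow::Status RenderListScalar() {
  arrow::Int16Builder builder;
  ARROW_RETURN_NOT_OK(builder.AppendValues({1, 2}));
  ARROW_RETURN_NOT_OK(builder.AppendNull());
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> values, builder.Finish());

  auto scalar = std::make_shared<arrow::ListScalar>(values);
  // CastTo(utf8()) now succeeds for list-like scalars; Scalar::ToString()
  // produces its output through the same cast.
  ARROW_ASSIGN_OR_RAISE(auto as_string, scalar->CastTo(arrow::utf8()));
  std::cout << as_string->ToString() << std::endl;
  return arrow::Status::OK();
}
```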
Lead-authored-by: LouisClt Co-authored-by: Louis CALOT Co-authored-by: David Li Signed-off-by: David Li --- c_glib/test/test-list-scalar.rb | 13 ++----------- c_glib/test/test-map-scalar.rb | 13 +------------ cpp/src/arrow/scalar.cc | 14 ++++++++++++++ cpp/src/arrow/scalar_test.cc | 4 ++++ 4 files changed, 21 insertions(+), 23 deletions(-) diff --git a/c_glib/test/test-list-scalar.rb b/c_glib/test/test-list-scalar.rb index 0ddbf60bc05e3..06633ee3bb48a 100644 --- a/c_glib/test/test-list-scalar.rb +++ b/c_glib/test/test-list-scalar.rb @@ -41,17 +41,8 @@ def test_equal end def test_to_s - assert_equal(<<-LIST.strip, @scalar.to_s) -[ - [ - [ - 1, - 2, - 3 - ] - ] -] - LIST + assert_equal("list>[list[1, 2, 3]]", + @scalar.to_s) end def test_value diff --git a/c_glib/test/test-map-scalar.rb b/c_glib/test/test-map-scalar.rb index 1e004569ef38e..541a7b32ea48b 100644 --- a/c_glib/test/test-map-scalar.rb +++ b/c_glib/test/test-map-scalar.rb @@ -57,18 +57,7 @@ def test_equal def test_to_s assert_equal(<<-MAP.strip, @scalar.to_s) -[ - keys: - [ - "hello", - "world" - ] - values: - [ - 1, - 2 - ] -] +map[{key:string = hello, value:int8 = 1}, {key:string = world, value:int8 = 2}] MAP end diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index 21e1cdedc2a5e..5ed92f0947649 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -1041,6 +1041,20 @@ Status CastImpl(const StructScalar& from, StringScalar* to) { return Status::OK(); } +// list based types (list, large list and map (fixed sized list too)) to string +Status CastImpl(const BaseListScalar& from, StringScalar* to) { + std::stringstream ss; + ss << from.type->ToString() << "["; + for (int64_t i = 0; i < from.value->length(); i++) { + if (i > 0) ss << ", "; + ARROW_ASSIGN_OR_RAISE(auto value, from.value->GetScalar(i)); + ss << value->ToString(); + } + ss << ']'; + to->value = Buffer::FromString(ss.str()); + return Status::OK(); +} + Status CastImpl(const UnionScalar& from, StringScalar* to) { const auto& union_ty = checked_cast(*from.type); std::stringstream ss; diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index 265ee3e94eb38..bf001fc6fd90e 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -1049,11 +1049,15 @@ class TestListScalar : public ::testing::Test { ASSERT_OK(scalar.ValidateFull()); ASSERT_TRUE(scalar.is_valid); AssertTypeEqual(scalar.type, type_); + // list[1, 2, null] + ASSERT_THAT(scalar.ToString(), ::testing::AllOf(::testing::HasSubstr("item: int16"), + ::testing::EndsWith("[1, 2, null]"))); auto null_scalar = CheckMakeNullScalar(type_); ASSERT_OK(null_scalar->ValidateFull()); ASSERT_FALSE(null_scalar->is_valid); AssertTypeEqual(null_scalar->type, type_); + ASSERT_EQ(null_scalar->ToString(), "null"); } void TestValidateErrors() { From 9a79c67055bb82a5df4b4a7afaaf288d1c40e980 Mon Sep 17 00:00:00 2001 From: patrick <100629128+p-a-a-a-trick@users.noreply.github.com> Date: Tue, 2 Aug 2022 10:13:21 -0400 Subject: [PATCH 07/11] MINOR: [Documentation] Added DARROW_FLIGHT_SQL to C++ optional components (#13777) Authored-by: patrick <100629128+p-a-a-a-trick@users.noreply.github.com> Signed-off-by: David Li --- docs/source/developers/cpp/building.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/developers/cpp/building.rst b/docs/source/developers/cpp/building.rst index 83c774c3dcca2..b988bd2eebe11 100644 --- a/docs/source/developers/cpp/building.rst +++ b/docs/source/developers/cpp/building.rst @@ -323,6 +323,7 @@ boolean flags to ``cmake``. 
filesystems * ``-DARROW_FLIGHT=ON``: Arrow Flight RPC system, which depends at least on gRPC +* ``-DARROW_FLIGHT_SQL=ON``: Arrow Flight SQL * ``-DARROW_GANDIVA=ON``: Gandiva expression compiler, depends on LLVM, Protocol Buffers, and re2 * ``-DARROW_GANDIVA_JAVA=ON``: Gandiva JNI bindings for Java From dd962782eed88d7c7acb798e98f1b70931e991fb Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 2 Aug 2022 16:33:43 +0200 Subject: [PATCH 08/11] ARROW-17253: [Python] Detect iterator exception instead of crashing (#13764) When `pa.array` is given a generator that raises an exception, ensure the exception is correctly detected, even when an explicit size is also given. Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/python/python_to_arrow.cc | 8 ++++++-- python/pyarrow/tests/test_convert_builtin.py | 8 ++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 024c9c575c311..7a94407d2d931 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -1114,13 +1114,17 @@ Status ConvertToSequenceAndInferSize(PyObject* obj, PyObject** seq, int64_t* siz RETURN_IF_PYERROR(); for (i = 0; i < n; i++) { PyObject* item = PyIter_Next(iter); - if (!item) break; + if (!item) { + // either an error occurred or the iterator ended + RETURN_IF_PYERROR(); + break; + } PyList_SET_ITEM(lst, i, item); } // Shrink list if len(iterator) < size if (i < n && PyList_SetSlice(lst, i, n, NULL)) { Py_DECREF(lst); - return Status::UnknownError("failed to resize list"); + RETURN_IF_PYERROR(); } *seq = lst; *size = std::min(i, *size); diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 1eb1d1b08ca8e..732164f9e132e 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -125,6 +125,14 @@ def test_infinite_iterator(): assert arr1.equals(expected) +def test_failing_iterator(): + with pytest.raises(ZeroDivisionError): + pa.array((1 // 0 for x in range(10))) + # ARROW-17253 + with pytest.raises(ZeroDivisionError): + pa.array((1 // 0 for x in range(10)), size=10) + + def _as_list(xs): return xs From 901e132b05c2d2f2b805f3e7aad8f28c1b2d2260 Mon Sep 17 00:00:00 2001 From: George Godik Date: Tue, 2 Aug 2022 10:59:41 -0400 Subject: [PATCH 09/11] ARROW-17273: [Go][CSV] Add Timestamp, Date32, Date64 format support to csv.Writer (#13772) Newly supported types - Date32 - Date64 - Timestamp csv.Reader currently supports Timestamps. 
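For context, a minimal usage sketch of what the writer side gains (schema and values are made up for the example; assumes the v9 Go module):

```go
package main

import (
	"os"

	"github.com/apache/arrow/go/v9/arrow"
	"github.com/apache/arrow/go/v9/arrow/array"
	"github.com/apache/arrow/go/v9/arrow/csv"
	"github.com/apache/arrow/go/v9/arrow/memory"
)

func main() {
	schema := arrow.NewSchema([]arrow.Field{
		{Name: "ts", Type: arrow.FixedWidthTypes.Timestamp_s},
		{Name: "d32", Type: arrow.FixedWidthTypes.Date32},
	}, nil)

	b := array.NewRecordBuilder(memory.DefaultAllocator, schema)
	defer b.Release()

	ts, err := arrow.TimestampFromString("2022-08-02 10:00:00", arrow.Second)
	if err != nil {
		panic(err)
	}
	b.Field(0).(*array.TimestampBuilder).Append(ts)
	b.Field(1).(*array.Date32Builder).Append(arrow.Date32(19206)) // days since the Unix epoch

	rec := b.NewRecord()
	defer rec.Release()

	// Timestamps render via the "2006-01-02 15:04:05.999999999" layout,
	// dates via their FormattedString() form.
	w := csv.NewWriter(os.Stdout, schema, csv.WithHeader(true))
	if err := w.Write(rec); err != nil {
		panic(err)
	}
	w.Flush()
}
```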
Not adding Date32/64 support to CSV as the default behavior will stay the same and parse as the broadest `timestamp` type https://issues.apache.org/jira/browse/ARROW-17273 Authored-by: ggodik Signed-off-by: Matt Topol --- go/arrow/csv/common.go | 1 + go/arrow/csv/writer.go | 29 +++++++++++++++++++++++++++++ go/arrow/csv/writer_test.go | 28 +++++++++++++++++++++++----- 3 files changed, 53 insertions(+), 5 deletions(-) diff --git a/go/arrow/csv/common.go b/go/arrow/csv/common.go index 0f1b9c4bb22de..92427cba9e45d 100644 --- a/go/arrow/csv/common.go +++ b/go/arrow/csv/common.go @@ -168,6 +168,7 @@ func validate(schema *arrow.Schema) { case *arrow.Float32Type, *arrow.Float64Type: case *arrow.StringType: case *arrow.TimestampType: + case *arrow.Date32Type, *arrow.Date64Type: default: panic(fmt.Errorf("arrow/csv: field %d (%s) has invalid data type %T", i, f.Name, ft)) } diff --git a/go/arrow/csv/writer.go b/go/arrow/csv/writer.go index 83b8e1e0734da..a6ccf5b7fc3ba 100644 --- a/go/arrow/csv/writer.go +++ b/go/arrow/csv/writer.go @@ -188,6 +188,35 @@ func (w *Writer) Write(record arrow.Record) error { recs[i][j] = w.nullValue } } + case *arrow.Date32Type: + arr := col.(*array.Date32) + for i := 0; i < arr.Len(); i++ { + if arr.IsValid(i) { + recs[i][j] = arr.Value(i).FormattedString() + } else { + recs[i][j] = w.nullValue + } + } + case *arrow.Date64Type: + arr := col.(*array.Date64) + for i := 0; i < arr.Len(); i++ { + if arr.IsValid(i) { + recs[i][j] = arr.Value(i).FormattedString() + } else { + recs[i][j] = w.nullValue + } + } + + case *arrow.TimestampType: + arr := col.(*array.Timestamp) + t := w.schema.Field(j).Type.(*arrow.TimestampType) + for i := 0; i < arr.Len(); i++ { + if arr.IsValid(i) { + recs[i][j] = arr.Value(i).ToTime(t.Unit).Format("2006-01-02 15:04:05.999999999") + } else { + recs[i][j] = w.nullValue + } + } } } diff --git a/go/arrow/csv/writer_test.go b/go/arrow/csv/writer_test.go index e9cd417d28e03..31593f4969b8b 100644 --- a/go/arrow/csv/writer_test.go +++ b/go/arrow/csv/writer_test.go @@ -139,6 +139,18 @@ func TestCSVWriter(t *testing.T) { } } +func genTimestamps(unit arrow.TimeUnit) []arrow.Timestamp { + out := []arrow.Timestamp{} + for _, input := range []string{"2014-07-28 15:04:05", "2016-09-08 15:04:05", "2021-09-18 15:04:05"} { + ts, err := arrow.TimestampFromString(input, unit) + if err != nil { + panic(fmt.Errorf("could not convert %s to arrow.Timestamp err=%s", input, err)) + } + out = append(out, ts) + } + return out +} + func testCSVWriter(t *testing.T, writeHeader bool) { f := new(bytes.Buffer) @@ -158,6 +170,9 @@ func testCSVWriter(t *testing.T, writeHeader bool) { {Name: "f32", Type: arrow.PrimitiveTypes.Float32}, {Name: "f64", Type: arrow.PrimitiveTypes.Float64}, {Name: "str", Type: arrow.BinaryTypes.String}, + {Name: "ts_s", Type: arrow.FixedWidthTypes.Timestamp_s}, + {Name: "d32", Type: arrow.FixedWidthTypes.Date32}, + {Name: "d64", Type: arrow.FixedWidthTypes.Date64}, }, nil, ) @@ -177,6 +192,9 @@ func testCSVWriter(t *testing.T, writeHeader bool) { b.Field(9).(*array.Float32Builder).AppendValues([]float32{0.0, 0.1, 0.2}, nil) b.Field(10).(*array.Float64Builder).AppendValues([]float64{0.0, 0.1, 0.2}, nil) b.Field(11).(*array.StringBuilder).AppendValues([]string{"str-0", "str-1", "str-2"}, nil) + b.Field(12).(*array.TimestampBuilder).AppendValues(genTimestamps(arrow.Second), nil) + b.Field(13).(*array.Date32Builder).AppendValues([]arrow.Date32{17304, 19304, 20304}, nil) + b.Field(14).(*array.Date64Builder).AppendValues([]arrow.Date64{1840400000000, 
1940400000000, 2040400000000}, nil) for _, field := range b.Fields() { field.AppendNull() } @@ -206,14 +224,14 @@ func testCSVWriter(t *testing.T, writeHeader bool) { t.Fatal(err) } - want := `true;-1;-1;-1;-1;0;0;0;0;0;0;str-0 -false;0;0;0;0;1;1;1;1;0.1;0.1;str-1 -true;1;1;1;1;2;2;2;2;0.2;0.2;str-2 -null;null;null;null;null;null;null;null;null;null;null;null + want := `true;-1;-1;-1;-1;0;0;0;0;0;0;str-0;2014-07-28 15:04:05;2017-05-18;2028-04-26 +false;0;0;0;0;1;1;1;1;0.1;0.1;str-1;2016-09-08 15:04:05;2022-11-08;2031-06-28 +true;1;1;1;1;2;2;2;2;0.2;0.2;str-2;2021-09-18 15:04:05;2025-08-04;2034-08-28 +null;null;null;null;null;null;null;null;null;null;null;null;null;null;null ` if writeHeader { - want = "bool;i8;i16;i32;i64;u8;u16;u32;u64;f32;f64;str\n" + want + want = "bool;i8;i16;i32;i64;u8;u16;u32;u64;f32;f64;str;ts_s;d32;d64\n" + want } if got, want := f.String(), want; strings.Compare(got, want) != 0 { From d3a0ab902ae5a9312cd6219409a8e94e0a133905 Mon Sep 17 00:00:00 2001 From: George Godik Date: Tue, 2 Aug 2022 14:12:04 -0400 Subject: [PATCH 10/11] ARROW-17277: [Go][CSV] Custom csv.Writer formatter for boolean values (#13774) - Use `WithBoolWriter` to override the default use of `strconv.FormatBool` - uses `strconv.FormatBool` by default - `WithBoolWriter(nil)` will not override the default behavior ``` w := csv.NewWriter(bytes.NewBuffer(nil), schema, csv.WithBoolWriter(func(b bool) string { if b { return "HELLO" } return "WORLD" })) ``` https://issues.apache.org/jira/browse/ARROW-17277 Authored-by: ggodik Signed-off-by: Matt Topol --- go/arrow/csv/common.go | 15 +++++ go/arrow/csv/writer.go | 20 ++++--- go/arrow/csv/writer_test.go | 116 +++++++++++++++++++++++++++++------- 3 files changed, 119 insertions(+), 32 deletions(-) diff --git a/go/arrow/csv/common.go b/go/arrow/csv/common.go index 92427cba9e45d..7bf3182cdd6ca 100644 --- a/go/arrow/csv/common.go +++ b/go/arrow/csv/common.go @@ -159,6 +159,21 @@ func WithNullWriter(null string) Option { } } +// WithBoolWriter overrides the default bool formatter with a function that returns +// a string representation of bool states, e.g. True, False, 1, 0 +func WithBoolWriter(fmtr func(bool) string) Option { + return func(cfg config) { + switch cfg := cfg.(type) { + case *Writer: + if fmtr != nil { + cfg.boolFormatter = fmtr + } + default: + panic(fmt.Errorf("arrow/csv: WithBoolWriter unknown config type %T", cfg)) + } + } +} + func validate(schema *arrow.Schema) { for i, f := range schema.Fields() { switch ft := f.Type.(type) { diff --git a/go/arrow/csv/writer.go b/go/arrow/csv/writer.go index a6ccf5b7fc3ba..d6975e741f31a 100644 --- a/go/arrow/csv/writer.go +++ b/go/arrow/csv/writer.go @@ -28,11 +28,12 @@ import ( // Writer wraps encoding/csv.Writer and writes arrow.Record based on a schema.
type Writer struct { - w *csv.Writer - schema *arrow.Schema - header bool - once sync.Once - nullValue string + boolFormatter func(bool) string + header bool + nullValue string + once sync.Once + schema *arrow.Schema + w *csv.Writer } // NewWriter returns a writer that writes arrow.Records to the CSV file @@ -44,9 +45,10 @@ func NewWriter(w io.Writer, schema *arrow.Schema, opts ...Option) *Writer { validate(schema) ww := &Writer{ - w: csv.NewWriter(w), - schema: schema, - nullValue: "NULL", // override by passing WithNullWriter() as an option + boolFormatter: strconv.FormatBool, // override by passing WithBoolWriter() as an option + nullValue: "NULL", // override by passing WithNullWriter() as an option + schema: schema, + w: csv.NewWriter(w), } for _, opt := range opts { opt(ww) @@ -84,7 +86,7 @@ func (w *Writer) Write(record arrow.Record) error { arr := col.(*array.Boolean) for i := 0; i < arr.Len(); i++ { if arr.IsValid(i) { - recs[i][j] = strconv.FormatBool(arr.Value(i)) + recs[i][j] = w.boolFormatter(arr.Value(i)) } else { recs[i][j] = w.nullValue } diff --git a/go/arrow/csv/writer_test.go b/go/arrow/csv/writer_test.go index 31593f4969b8b..d08ed7b9b25e5 100644 --- a/go/arrow/csv/writer_test.go +++ b/go/arrow/csv/writer_test.go @@ -17,11 +17,12 @@ package csv_test import ( + "bufio" "bytes" + ecsv "encoding/csv" "fmt" "io/ioutil" "log" - "strings" "testing" "github.com/apache/arrow/go/v9/arrow" @@ -30,6 +31,11 @@ import ( "github.com/apache/arrow/go/v9/arrow/memory" ) +const ( + separator = ';' + nullVal = "null" +) + func Example_writer() { f := new(bytes.Buffer) @@ -121,20 +127,54 @@ func Example_writer() { // rec[9]["str"]: ["str-9"] } +var ( + fullData = [][]string{ + {"bool", "i8", "i16", "i32", "i64", "u8", "u16", "u32", "u64", "f32", "f64", "str", "ts_s", "d32", "d64"}, + {"true", "-1", "-1", "-1", "-1", "0", "0", "0", "0", "0", "0", "str-0", "2014-07-28 15:04:05", "2017-05-18", "2028-04-26"}, + {"false", "0", "0", "0", "0", "1", "1", "1", "1", "0.1", "0.1", "str-1", "2016-09-08 15:04:05", "2022-11-08", "2031-06-28"}, + {"true", "1", "1", "1", "1", "2", "2", "2", "2", "0.2", "0.2", "str-2", "2021-09-18 15:04:05", "2025-08-04", "2034-08-28"}, + {nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal}, + } + bananaData = [][]string{ + {"bool", "i8", "i16", "i32", "i64", "u8", "u16", "u32", "u64", "f32", "f64", "str", "ts_s", "d32", "d64"}, + {"BANANA", "-1", "-1", "-1", "-1", "0", "0", "0", "0", "0", "0", "str-0", "2014-07-28 15:04:05", "2017-05-18", "2028-04-26"}, + {"MANGO", "0", "0", "0", "0", "1", "1", "1", "1", "0.1", "0.1", "str-1", "2016-09-08 15:04:05", "2022-11-08", "2031-06-28"}, + {"BANANA", "1", "1", "1", "1", "2", "2", "2", "2", "0.2", "0.2", "str-2", "2021-09-18 15:04:05", "2025-08-04", "2034-08-28"}, + {nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal}, + } +) + func TestCSVWriter(t *testing.T) { tests := []struct { - name string - header bool - }{{ - name: "Noheader", - header: false, - }, { - name: "Header", - header: true, - }} + name string + header bool + boolFormat func(bool) string + data [][]string + }{ + { + name: "Noheader", + header: false, + data: fullData[1:], + }, + { + name: "header", + header: true, + data: fullData, + }, + { + name: "Header with bool fmt", + header: true, + boolFormat: func(b bool) string { + if b { + return "BANANA" + } + return "MANGO" + }, + data: bananaData, + }} for 
_, test := range tests { t.Run(test.name, func(t *testing.T) { - testCSVWriter(t, test.header) + testCSVWriter(t, test.data, test.header, test.boolFormat) }) } } @@ -151,7 +191,7 @@ func genTimestamps(unit arrow.TimeUnit) []arrow.Timestamp { return out } -func testCSVWriter(t *testing.T, writeHeader bool) { +func testCSVWriter(t *testing.T, data [][]string, writeHeader bool, fmtr func(bool) string) { f := new(bytes.Buffer) pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) @@ -204,10 +244,11 @@ func testCSVWriter(t *testing.T, writeHeader bool) { defer rec.Release() w := csv.NewWriter(f, schema, - csv.WithComma(';'), + csv.WithComma(separator), csv.WithCRLF(false), csv.WithHeader(writeHeader), - csv.WithNullWriter("null"), + csv.WithNullWriter(nullVal), + csv.WithBoolWriter(fmtr), ) err := w.Write(rec) if err != nil { @@ -224,19 +265,48 @@ func testCSVWriter(t *testing.T, writeHeader bool) { t.Fatal(err) } - want := `true;-1;-1;-1;-1;0;0;0;0;0;0;str-0;2014-07-28 15:04:05;2017-05-18;2028-04-26 -false;0;0;0;0;1;1;1;1;0.1;0.1;str-1;2016-09-08 15:04:05;2022-11-08;2031-06-28 -true;1;1;1;1;2;2;2;2;0.2;0.2;str-2;2021-09-18 15:04:05;2025-08-04;2034-08-28 -null;null;null;null;null;null;null;null;null;null;null;null;null;null;null -` + bdata, err := expectedOutout(data) + if err != nil { + t.Fatal(err) + } - if writeHeader { - want = "bool;i8;i16;i32;i64;u8;u16;u32;u64;f32;f64;str;ts_s;d32;d64\n" + want + if err = matchCSV(bdata.Bytes(), f.Bytes()); err != nil { + t.Fatal(err) + } +} + +func expectedOutout(data [][]string) (*bytes.Buffer, error) { + b := bytes.NewBuffer(nil) + w := ecsv.NewWriter(b) + w.Comma = separator + w.UseCRLF = false + return b, w.WriteAll(data) +} + +func matchCSV(expected, test []byte) error { + expectedScanner := bufio.NewScanner(bytes.NewReader(expected)) + testScanner := bufio.NewScanner(bytes.NewReader(test)) + line := 0 + for expectedScanner.Scan() && testScanner.Scan() { + if expectedScanner.Text() != testScanner.Text() { + return fmt.Errorf("expected=%s != test=%s line=%d", expectedScanner.Text(), testScanner.Text(), line) + } + line++ } - if got, want := f.String(), want; strings.Compare(got, want) != 0 { - t.Fatalf("invalid output:\ngot=%s\nwant=%s\n", got, want) + if expectedScanner.Scan() { + return fmt.Errorf("expected unprocessed:%s", expectedScanner.Text()) } + + if testScanner.Scan() { + return fmt.Errorf("test unprocessed:%s", testScanner.Text()) + } + + if err := expectedScanner.Err(); err != nil { + return err + } + + return testScanner.Err() } func BenchmarkWrite(b *testing.B) { From a9dcaff86960f3424c25d38c55fc6f8dcf0cdb9f Mon Sep 17 00:00:00 2001 From: Ankit Gehlot <46342250+Ankit01Gehlot@users.noreply.github.com> Date: Tue, 2 Aug 2022 23:43:03 +0530 Subject: [PATCH 11/11] ARROW-17269: [Java] implemented TransferPair methods in MapVector to get correct valuevector as mapvector instead of listvector (#13776) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …rect valuevector as mapvector instead of listvector Authored-by: ankitgehlot Signed-off-by: Antoine Pitrou --- .../arrow/vector/complex/ListVector.java | 10 +- .../arrow/vector/complex/MapVector.java | 152 ++++++++++++++++++ .../apache/arrow/vector/TestMapVector.java | 23 +++ .../arrow/vector/TestSplitAndTransfer.java | 22 +++ 4 files changed, 202 insertions(+), 5 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index 
d8fe72a7074b1..0fa091fb0cede 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -76,13 +76,13 @@ public static ListVector empty(String name, BufferAllocator allocator) { protected ArrowBuf validityBuffer; protected UnionListReader reader; private CallBack callBack; - private final FieldType fieldType; - private int validityAllocationSizeInBytes; + protected final FieldType fieldType; + protected int validityAllocationSizeInBytes; /** * The maximum index that is actually set. */ - private int lastSet; + protected int lastSet; /** * Constructs a new instance. @@ -276,7 +276,7 @@ public boolean allocateNewSafe() { return true; } - private void allocateValidityBuffer(final long size) { + protected void allocateValidityBuffer(final long size) { final int curSize = (int) size; validityBuffer = allocator.buffer(curSize); validityBuffer.readerIndex(0); @@ -296,7 +296,7 @@ public void reAlloc() { super.reAlloc(); } - private void reallocValidityAndOffsetBuffers() { + protected void reallocValidityAndOffsetBuffers() { reallocOffsetBuffer(); reallocValidityBuffer(); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java index 14cba0926e193..b8f3f32a73a29 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java @@ -22,8 +22,12 @@ import java.util.List; import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.util.Preconditions; import org.apache.arrow.vector.AddOrGetResult; +import org.apache.arrow.vector.BitVectorHelper; import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.ZeroVector; import org.apache.arrow.vector.complex.impl.UnionMapReader; import org.apache.arrow.vector.complex.impl.UnionMapWriter; import org.apache.arrow.vector.types.Types; @@ -32,6 +36,7 @@ import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.TransferPair; /** * A MapVector is used to store entries of key/value pairs. 
@@ -119,4 +124,151 @@ public UnionMapReader getReader() {
   public MinorType getMinorType() {
     return MinorType.MAP;
   }
+
+  @Override
+  public TransferPair getTransferPair(String ref, BufferAllocator allocator) {
+    return getTransferPair(ref, allocator, null);
+  }
+
+  @Override
+  public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) {
+    return new TransferImpl(ref, allocator, callBack);
+  }
+
+  @Override
+  public TransferPair makeTransferPair(ValueVector target) {
+    return new MapVector.TransferImpl((MapVector) target);
+  }
+
+  private class TransferImpl implements TransferPair {
+
+    MapVector to;
+    TransferPair dataTransferPair;
+
+    public TransferImpl(String name, BufferAllocator allocator, CallBack callBack) {
+      this(new MapVector(name, allocator, fieldType, callBack));
+    }
+
+    public TransferImpl(MapVector to) {
+      this.to = to;
+      to.addOrGetVector(vector.getField().getFieldType());
+      if (to.getDataVector() instanceof ZeroVector) {
+        to.addOrGetVector(vector.getField().getFieldType());
+      }
+      dataTransferPair = getDataVector().makeTransferPair(to.getDataVector());
+    }
+
+    /**
+     * Transfer this vector's data to another vector. The memory associated
+     * with this vector is transferred to the allocator of the target vector
+     * for accounting and management purposes.
+     */
+    @Override
+    public void transfer() {
+      to.clear();
+      dataTransferPair.transfer();
+      to.validityBuffer = transferBuffer(validityBuffer, to.allocator);
+      to.offsetBuffer = transferBuffer(offsetBuffer, to.allocator);
+      to.lastSet = lastSet;
+      if (valueCount > 0) {
+        to.setValueCount(valueCount);
+      }
+      clear();
+    }
+
+    /**
+     * Slice this vector at the desired index and length and transfer the
+     * corresponding data to the target vector.
+     * @param startIndex start position of the split in source vector.
+     * @param length length of the split.
+     */
+    @Override
+    public void splitAndTransfer(int startIndex, int length) {
+      Preconditions.checkArgument(startIndex >= 0 && length >= 0 && startIndex + length <= valueCount,
+          "Invalid parameters startIndex: %s, length: %s for valueCount: %s", startIndex, length, valueCount);
+      final int startPoint = offsetBuffer.getInt(startIndex * OFFSET_WIDTH);
+      final int sliceLength = offsetBuffer.getInt((startIndex + length) * OFFSET_WIDTH) - startPoint;
+      to.clear();
+      to.allocateOffsetBuffer((length + 1) * OFFSET_WIDTH);
+      /* splitAndTransfer offset buffer */
+      for (int i = 0; i < length + 1; i++) {
+        final int relativeOffset = offsetBuffer.getInt((startIndex + i) * OFFSET_WIDTH) - startPoint;
+        to.offsetBuffer.setInt(i * OFFSET_WIDTH, relativeOffset);
+      }
+      /* splitAndTransfer validity buffer */
+      splitAndTransferValidityBuffer(startIndex, length, to);
+      /* splitAndTransfer data buffer */
+      dataTransferPair.splitAndTransfer(startPoint, sliceLength);
+      to.lastSet = length - 1;
+      to.setValueCount(length);
+    }
+
+    /*
+     * transfer the validity.
+     */
+    private void splitAndTransferValidityBuffer(int startIndex, int length, MapVector target) {
+      int firstByteSource = BitVectorHelper.byteIndex(startIndex);
+      int lastByteSource = BitVectorHelper.byteIndex(valueCount - 1);
+      int byteSizeTarget = getValidityBufferSizeFromCount(length);
+      int offset = startIndex % 8;
+
+      if (length > 0) {
+        if (offset == 0) {
+          // slice
+          if (target.validityBuffer != null) {
+            target.validityBuffer.getReferenceManager().release();
+          }
+          target.validityBuffer = validityBuffer.slice(firstByteSource, byteSizeTarget);
+          target.validityBuffer.getReferenceManager().retain(1);
+        } else {
+          /* Copy data
+           * When the first bit starts from the middle of a byte (offset != 0),
+           * copy data from src BitVector.
+           * Each byte in the target is composed of a part of the i-th byte and
+           * a part of the (i+1)-th byte.
+           */
+          target.allocateValidityBuffer(byteSizeTarget);
+
+          for (int i = 0; i < byteSizeTarget - 1; i++) {
+            byte b1 = BitVectorHelper.getBitsFromCurrentByte(validityBuffer, firstByteSource + i, offset);
+            byte b2 = BitVectorHelper.getBitsFromNextByte(validityBuffer, firstByteSource + i + 1, offset);
+
+            target.validityBuffer.setByte(i, (b1 + b2));
+          }
+
+          /* Copying the last piece is done in the following manner:
+           * if the source vector has 1 or more bytes remaining, we copy
+           * the last piece as a byte formed by shifting data
+           * from the current byte and the next byte.
+           *
+           * if the source vector has no more bytes remaining
+           * (we are at the last byte), we copy the last piece as a byte
+           * by shifting data from the current byte.
+           */
+          if ((firstByteSource + byteSizeTarget - 1) < lastByteSource) {
+            byte b1 = BitVectorHelper.getBitsFromCurrentByte(validityBuffer,
+                firstByteSource + byteSizeTarget - 1, offset);
+            byte b2 = BitVectorHelper.getBitsFromNextByte(validityBuffer,
+                firstByteSource + byteSizeTarget, offset);
+
+            target.validityBuffer.setByte(byteSizeTarget - 1, b1 + b2);
+          } else {
+            byte b1 = BitVectorHelper.getBitsFromCurrentByte(validityBuffer,
+                firstByteSource + byteSizeTarget - 1, offset);
+            target.validityBuffer.setByte(byteSizeTarget - 1, b1);
+          }
+        }
+      }
+    }
+
+    @Override
+    public ValueVector getTo() {
+      return to;
+    }
+
+    @Override
+    public void copyValueSafe(int from, int to) {
+      this.to.copyFrom(from, to, MapVector.this);
+    }
+  }
 }
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestMapVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestMapVector.java
index 9637021dbdad8..d60d5611a5f7b 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestMapVector.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestMapVector.java
@@ -20,6 +20,7 @@
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertSame;
 import static org.junit.Assert.assertTrue;
 
 import java.util.ArrayList;
@@ -1110,4 +1111,26 @@ public void testClearAndReuse() {
       assertEquals(55, getResultValue(resultStruct));
     }
   }
+
+  @Test
+  public void testGetTransferPair() {
+    try (MapVector mapVector = MapVector.empty("mapVector", allocator, false)) {
+
+      FieldType type = new FieldType(false, ArrowType.Struct.INSTANCE, null, null);
+      AddOrGetResult<StructVector> addResult = mapVector.addOrGetVector(type);
+      FieldType keyType = new FieldType(false, MinorType.BIGINT.getType(), null, null);
+      FieldType valueType = FieldType.nullable(MinorType.FLOAT8.getType());
+      addResult.getVector().addOrGet(MapVector.KEY_NAME, keyType, BigIntVector.class);
+      addResult.getVector().addOrGet(MapVector.VALUE_NAME, valueType, Float8Vector.class);
+      mapVector.allocateNew();
+      mapVector.setValueCount(0);
+
+      assertEquals(-1, mapVector.getLastSet());
+      TransferPair tp = mapVector.getTransferPair(mapVector.getName(), allocator, null);
+      tp.transfer();
+      ValueVector vector = tp.getTo();
+      assertSame(vector.getClass(), mapVector.getClass());
+      vector.clear();
+    }
+  }
 }
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java b/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java
index e60b87e601974..716fa0bde454d 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java
@@ -29,8 +29,10 @@
 import org.apache.arrow.memory.RootAllocator;
 import org.apache.arrow.vector.complex.FixedSizeListVector;
 import org.apache.arrow.vector.complex.ListVector;
+import org.apache.arrow.vector.complex.MapVector;
 import org.apache.arrow.vector.complex.StructVector;
 import org.apache.arrow.vector.complex.UnionVector;
+import org.apache.arrow.vector.types.pojo.ArrowType;
 import org.apache.arrow.vector.types.pojo.ArrowType.Struct;
 import org.apache.arrow.vector.types.pojo.FieldType;
 import org.apache.arrow.vector.util.TransferPair;
@@ -406,5 +408,25 @@ public void testStructVectorZeroStartIndexAndLength() {
     }
   }
 
+  @Test
+  public void testMapVectorZeroStartIndexAndLength() {
+    Map<String, String> metadata = new HashMap<>();
+    metadata.put("k1", "v1");
+    FieldType type = new FieldType(true, new ArrowType.Map(false), null, metadata);
+    try (final MapVector mapVector = new MapVector("mapVec", allocator, type, null);
+         final MapVector newMapVector = new MapVector("newMapVec", allocator, type, null)) {
+
+      mapVector.allocateNew();
+      final int valueCount = 0;
+      mapVector.setValueCount(valueCount);
+
+      final TransferPair tp = mapVector.makeTransferPair(newMapVector);
+
+      tp.splitAndTransfer(0, 0);
+      assertEquals(valueCount, newMapVector.getValueCount());
+
+      newMapVector.clear();
+    }
+  }
 }
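
What the MapVector TransferPair above buys: transferring out of a MapVector now yields a MapVector rather than a ListVector. The following is a minimal, illustrative sketch only, not part of the patch; it mirrors the setup of testGetTransferPair above, and the class name, vector names, and standalone main harness are assumptions made for the example.

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.AddOrGetResult;
import org.apache.arrow.vector.BigIntVector;
import org.apache.arrow.vector.Float8Vector;
import org.apache.arrow.vector.ValueVector;
import org.apache.arrow.vector.complex.MapVector;
import org.apache.arrow.vector.complex.StructVector;
import org.apache.arrow.vector.types.Types.MinorType;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.util.TransferPair;

public class MapVectorTransferSketch {
  public static void main(String[] args) {
    try (BufferAllocator allocator = new RootAllocator();
         MapVector source = MapVector.empty("source", allocator, /*keysSorted=*/false)) {
      // Give the map its struct-of-(key, value) entries, as the patch's test does.
      FieldType structType = new FieldType(false, ArrowType.Struct.INSTANCE, null, null);
      AddOrGetResult<StructVector> entries = source.addOrGetVector(structType);
      entries.getVector().addOrGet(MapVector.KEY_NAME,
          new FieldType(false, MinorType.BIGINT.getType(), null, null), BigIntVector.class);
      entries.getVector().addOrGet(MapVector.VALUE_NAME,
          FieldType.nullable(MinorType.FLOAT8.getType()), Float8Vector.class);
      source.allocateNew();
      source.setValueCount(0);

      // Before this patch the pair fell through to ListVector's TransferImpl,
      // so getTo() returned a ListVector; with the patch it returns a MapVector.
      TransferPair pair = source.getTransferPair("target", allocator);
      pair.transfer();
      ValueVector target = pair.getTo();
      System.out.println(target.getClass().getSimpleName()); // MapVector
      target.clear();
    }
  }
}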
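The splitAndTransfer path can likewise slice a populated MapVector. Another illustrative sketch, reusing `source` and `allocator` from the snippet above and assuming `source` has been filled with at least three entries (for example through source.getWriter()); the vector name `slice` is invented for the example.

// Slice entries [1, 1 + 2) out of `source` into a fresh MapVector.
try (MapVector slice = MapVector.empty("slice", allocator, /*keysSorted=*/false)) {
  TransferPair pair = source.makeTransferPair(slice);
  // Offsets written into `slice` are rebased to start at zero; validity bits
  // are sliced byte-for-byte when startIndex % 8 == 0 and rebuilt bit-by-bit
  // otherwise, the two cases splitAndTransferValidityBuffer distinguishes.
  pair.splitAndTransfer(1, 2);
  System.out.println(slice.getValueCount()); // 2
}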