From 4b1e17824b10fd158335f4b361e4b4bc58830a37 Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 26 Nov 2024 19:23:25 -0500 Subject: [PATCH] GH-6: Add Linux test CI Fixes #6. --- .env | 52 + .github/workflows/java.yml | 73 + .gitignore | 1 + .gitmodules | 3 + arrow-format/File.fbs | 52 + arrow-format/Flight.proto | 645 ++++++ arrow-format/FlightSql.proto | 1925 +++++++++++++++++ arrow-format/Message.fbs | 157 ++ arrow-format/README.rst | 25 + arrow-format/Schema.fbs | 571 +++++ arrow-format/SparseTensor.fbs | 228 ++ arrow-format/Tensor.fbs | 54 + arrow-format/substrait/extension_types.yaml | 170 ++ ci/scripts/java_build.sh | 78 + ci/scripts/java_test.sh | 55 + dataset/pom.xml | 2 +- docker-compose.yml | 47 + flight/flight-core/pom.xml | 4 +- flight/flight-sql-jdbc-core/pom.xml | 2 +- testing | 1 + .../apache/arrow/tools/TestIntegration.java | 20 +- .../resources/integration_json_simple.json | 98 + .../resources/integration_json_struct.json | 201 ++ .../apache/arrow/vector/ipc/TestJSONFile.java | 8 +- .../resources/integration_json_struct.json | 201 ++ 25 files changed, 4653 insertions(+), 20 deletions(-) create mode 100644 .env create mode 100644 .github/workflows/java.yml create mode 100644 .gitmodules create mode 100644 arrow-format/File.fbs create mode 100644 arrow-format/Flight.proto create mode 100644 arrow-format/FlightSql.proto create mode 100644 arrow-format/Message.fbs create mode 100644 arrow-format/README.rst create mode 100644 arrow-format/Schema.fbs create mode 100644 arrow-format/SparseTensor.fbs create mode 100644 arrow-format/Tensor.fbs create mode 100644 arrow-format/substrait/extension_types.yaml create mode 100755 ci/scripts/java_build.sh create mode 100755 ci/scripts/java_test.sh create mode 100644 docker-compose.yml create mode 160000 testing create mode 100644 tools/src/test/resources/integration_json_simple.json create mode 100644 tools/src/test/resources/integration_json_struct.json create mode 100644 vector/src/test/resources/integration_json_struct.json diff --git a/.env b/.env new file mode 100644 index 000000000..959b1bfc2 --- /dev/null +++ b/.env @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# All of the following environment variables are required to set default values +# for the parameters in docker-compose.yml. 
+
+# An empty prefix means that the docker-compose configuration will use named
+# volumes, which potentially improves performance on Docker for macOS and
+# Docker for Windows, and also prevents contamination of the source directory.
+# A non-empty prefix means that directories from the host are bind-mounted
+# into the container. It should be set to ".docker/" on GitHub Actions to keep
+# the cache plugin functional.
+DOCKER_VOLUME_PREFIX=
+
+# Turn on the inline build cache. This is a Docker Buildx feature documented
+# at https://github.com/docker/buildx#--cache-tonametypetypekeyvalue
+BUILDKIT_INLINE_CACHE=1
+COMPOSE_DOCKER_CLI_BUILD=1
+DOCKER_BUILDKIT=1
+
+# Different architecture notations
+ARCH=amd64
+ARCH_ALIAS=x86_64
+ARCH_SHORT=amd64
+
+# Default repository to pull and push images from
+REPO=apache/arrow-dev
+
+# The setup attempts to generate coredumps by default; to disable coredump
+# generation, set this to 0.
+ULIMIT_CORE=-1
+
+# Default versions for various dependencies
+JDK=11
+MAVEN=3.9.6
diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml
new file mode 100644
index 000000000..8641059ff
--- /dev/null
+++ b/.github/workflows/java.yml
@@ -0,0 +1,73 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+ +name: Test + +on: + push: + branches: + - '**' + - '!dependabot/**' + tags: + - '**' + pull_request: + +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + +permissions: + contents: read + +env: + DOCKER_VOLUME_PREFIX: ".docker/" + +jobs: + ubuntu: + name: AMD64 Ubuntu 22.04 JDK ${{ matrix.jdk }} Maven ${{ matrix.maven }} + runs-on: ubuntu-latest + if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 30 + strategy: + fail-fast: false + matrix: + jdk: [11, 17, 21, 22] + maven: [3.9.6] + image: [java] + env: + JDK: ${{ matrix.jdk }} + MAVEN: ${{ matrix.maven }} + steps: + - name: Checkout Arrow + uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 + with: + fetch-depth: 0 + submodules: recursive + - name: Cache Docker Volumes + uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2 # v4.0.0 + with: + path: .docker + key: maven-${{ hashFiles('java/**') }} + restore-keys: maven- + - name: Execute Docker Build + env: + DEVELOCITY_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + run: | + docker compose run \ + -e CI=true \ + -e "DEVELOCITY_ACCESS_KEY=$DEVELOCITY_ACCESS_KEY" \ + ${{ matrix.image }} diff --git a/.gitignore b/.gitignore index 63c90af7b..80722af45 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,4 @@ arrow-git.properties cmake_install.cmake install_manifest.txt target/ +/.mvn/.develocity/ diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..71139e3e3 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "testing"] + path = testing + url = git@github.com:apache/arrow-testing diff --git a/arrow-format/File.fbs b/arrow-format/File.fbs new file mode 100644 index 000000000..906d494f2 --- /dev/null +++ b/arrow-format/File.fbs @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +include "Schema.fbs"; + +namespace org.apache.arrow.flatbuf; + +/// ---------------------------------------------------------------------- +/// Arrow File metadata +/// + +table Footer { + version: org.apache.arrow.flatbuf.MetadataVersion; + + schema: org.apache.arrow.flatbuf.Schema; + + dictionaries: [ Block ]; + + recordBatches: [ Block ]; + + /// User-defined metadata + custom_metadata: [ KeyValue ]; +} + +struct Block { + + /// Index to the start of the RecordBlock (note this is past the Message header) + offset: long; + + /// Length of the metadata + metaDataLength: int; + + /// Length of the data (this is aligned so there can be a gap between this and + /// the metadata). 
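+ /// As a non-normative illustration, readers usually consume these blocks
+ /// through the IPC layer rather than by hand; a Java sketch (assuming this
+ /// repository's org.apache.arrow.vector.ipc.ArrowFileReader):
+ ///
+ ///   try (ArrowFileReader reader = new ArrowFileReader(channel, allocator)) {
+ ///     reader.getRecordBlocks();  // one entry per Footer.recordBatches Block
+ ///   }
+ ///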
+ bodyLength: long; +} + +root_type Footer; diff --git a/arrow-format/Flight.proto b/arrow-format/Flight.proto new file mode 100644 index 000000000..f2b0f889c --- /dev/null +++ b/arrow-format/Flight.proto @@ -0,0 +1,645 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *
+ * http://www.apache.org/licenses/LICENSE-2.0 + *
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +syntax = "proto3"; +import "google/protobuf/timestamp.proto"; + +option java_package = "org.apache.arrow.flight.impl"; +option go_package = "github.com/apache/arrow-go/arrow/flight/gen/flight"; +option csharp_namespace = "Apache.Arrow.Flight.Protocol"; + +package arrow.flight.protocol; + +/* + * A flight service is an endpoint for retrieving or storing Arrow data. A + * flight service can expose one or more predefined endpoints that can be + * accessed using the Arrow Flight Protocol. Additionally, a flight service + * can expose a set of actions that are available. + */ +service FlightService { + + /* + * Handshake between client and server. Depending on the server, the + * handshake may be required to determine the token that should be used for + * future operations. Both request and response are streams to allow multiple + * round-trips depending on auth mechanism. + */ + rpc Handshake(stream HandshakeRequest) returns (stream HandshakeResponse) {} + + /* + * Get a list of available streams given a particular criteria. Most flight + * services will expose one or more streams that are readily available for + * retrieval. This api allows listing the streams available for + * consumption. A user can also provide a criteria. The criteria can limit + * the subset of streams that can be listed via this interface. Each flight + * service allows its own definition of how to consume criteria. + */ + rpc ListFlights(Criteria) returns (stream FlightInfo) {} + + /* + * For a given FlightDescriptor, get information about how the flight can be + * consumed. This is a useful interface if the consumer of the interface + * already can identify the specific flight to consume. This interface can + * also allow a consumer to generate a flight stream through a specified + * descriptor. For example, a flight descriptor might be something that + * includes a SQL statement or a Pickled Python operation that will be + * executed. In those cases, the descriptor will not be previously available + * within the list of available streams provided by ListFlights but will be + * available for consumption for the duration defined by the specific flight + * service. + */ + rpc GetFlightInfo(FlightDescriptor) returns (FlightInfo) {} + + /* + * For a given FlightDescriptor, start a query and get information + * to poll its execution status. This is a useful interface if the + * query may be a long-running query. The first PollFlightInfo call + * should return as quickly as possible. (GetFlightInfo doesn't + * return until the query is complete.) + * + * A client can consume any available results before + * the query is completed. See PollInfo.info for details. + * + * A client can poll the updated query status by calling + * PollFlightInfo() with PollInfo.flight_descriptor. A server + * should not respond until the result would be different from last + * time. That way, the client can "long poll" for updates + * without constantly making requests. Clients can set a short timeout + * to avoid blocking calls if desired. + * + * A client can't use PollInfo.flight_descriptor after + * PollInfo.expiration_time passes. 
A server might not accept the + * retry descriptor anymore and the query may be cancelled. + * + * A client may use the CancelFlightInfo action with + * PollInfo.info to cancel the running query. + */ + rpc PollFlightInfo(FlightDescriptor) returns (PollInfo) {} + + /* + * For a given FlightDescriptor, get the Schema as described in Schema.fbs::Schema + * This is used when a consumer needs the Schema of flight stream. Similar to + * GetFlightInfo this interface may generate a new flight that was not previously + * available in ListFlights. + */ + rpc GetSchema(FlightDescriptor) returns (SchemaResult) {} + + /* + * Retrieve a single stream associated with a particular descriptor + * associated with the referenced ticket. A Flight can be composed of one or + * more streams where each stream can be retrieved using a separate opaque + * ticket that the flight service uses for managing a collection of streams. + */ + rpc DoGet(Ticket) returns (stream FlightData) {} + + /* + * Push a stream to the flight service associated with a particular + * flight stream. This allows a client of a flight service to upload a stream + * of data. Depending on the particular flight service, a client consumer + * could be allowed to upload a single stream per descriptor or an unlimited + * number. In the latter, the service might implement a 'seal' action that + * can be applied to a descriptor once all streams are uploaded. + */ + rpc DoPut(stream FlightData) returns (stream PutResult) {} + + /* + * Open a bidirectional data channel for a given descriptor. This + * allows clients to send and receive arbitrary Arrow data and + * application-specific metadata in a single logical stream. In + * contrast to DoGet/DoPut, this is more suited for clients + * offloading computation (rather than storage) to a Flight service. + */ + rpc DoExchange(stream FlightData) returns (stream FlightData) {} + + /* + * Flight services can support an arbitrary number of simple actions in + * addition to the possible ListFlights, GetFlightInfo, DoGet, DoPut + * operations that are potentially available. DoAction allows a flight client + * to do a specific action against a flight service. An action includes + * opaque request and response objects that are specific to the type action + * being undertaken. + */ + rpc DoAction(Action) returns (stream Result) {} + + /* + * A flight service exposes all of the available action types that it has + * along with descriptions. This allows different flight consumers to + * understand the capabilities of the flight service. + */ + rpc ListActions(Empty) returns (stream ActionType) {} +} + +/* + * The request that a client provides to a server on handshake. + */ +message HandshakeRequest { + + /* + * A defined protocol version + */ + uint64 protocol_version = 1; + + /* + * Arbitrary auth/handshake info. + */ + bytes payload = 2; +} + +message HandshakeResponse { + + /* + * A defined protocol version + */ + uint64 protocol_version = 1; + + /* + * Arbitrary auth/handshake info. + */ + bytes payload = 2; +} + +/* + * A message for doing simple auth. + */ +message BasicAuth { + string username = 2; + string password = 3; +} + +message Empty {} + +/* + * Describes an available action, including both the name used for execution + * along with a short description of the purpose of the action. + */ +message ActionType { + string type = 1; + string description = 2; +} + +/* + * A service specific expression that can be used to return a limited set + * of available Arrow Flight streams. 
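+ *
+ * As a non-normative Java sketch (assuming a connected
+ * org.apache.arrow.flight.FlightClient named `client`; the expression
+ * bytes below are an invented example, since only the service defines
+ * how they are interpreted):
+ *
+ *   Criteria criteria =
+ *       new Criteria("category=sales".getBytes(StandardCharsets.UTF_8));
+ *   for (FlightInfo info : client.listFlights(criteria)) {
+ *     System.out.println(info.getDescriptor());
+ *   }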
+ */ +message Criteria { + bytes expression = 1; +} + +/* + * An opaque action specific for the service. + */ +message Action { + string type = 1; + bytes body = 2; +} + +/* + * An opaque result returned after executing an action. + */ +message Result { + bytes body = 1; +} + +/* + * Wrap the result of a getSchema call + */ +message SchemaResult { + // The schema of the dataset in its IPC form: + // 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix + // 4 bytes - the byte length of the payload + // a flatbuffer Message whose header is the Schema + bytes schema = 1; +} + +/* + * The name or tag for a Flight. May be used as a way to retrieve or generate + * a flight or be used to expose a set of previously defined flights. + */ +message FlightDescriptor { + + /* + * Describes what type of descriptor is defined. + */ + enum DescriptorType { + + // Protobuf pattern, not used. + UNKNOWN = 0; + + /* + * A named path that identifies a dataset. A path is composed of a string + * or list of strings describing a particular dataset. This is conceptually + * similar to a path inside a filesystem. + */ + PATH = 1; + + /* + * An opaque command to generate a dataset. + */ + CMD = 2; + } + + DescriptorType type = 1; + + /* + * Opaque value used to express a command. Should only be defined when + * type = CMD. + */ + bytes cmd = 2; + + /* + * List of strings identifying a particular dataset. Should only be defined + * when type = PATH. + */ + repeated string path = 3; +} + +/* + * The access coordinates for retrieval of a dataset. With a FlightInfo, a + * consumer is able to determine how to retrieve a dataset. + */ +message FlightInfo { + // The schema of the dataset in its IPC form: + // 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix + // 4 bytes - the byte length of the payload + // a flatbuffer Message whose header is the Schema + bytes schema = 1; + + /* + * The descriptor associated with this info. + */ + FlightDescriptor flight_descriptor = 2; + + /* + * A list of endpoints associated with the flight. To consume the + * whole flight, all endpoints (and hence all Tickets) must be + * consumed. Endpoints can be consumed in any order. + * + * In other words, an application can use multiple endpoints to + * represent partitioned data. + * + * If the returned data has an ordering, an application can use + * "FlightInfo.ordered = true" or should return the all data in a + * single endpoint. Otherwise, there is no ordering defined on + * endpoints or the data within. + * + * A client can read ordered data by reading data from returned + * endpoints, in order, from front to back. + * + * Note that a client may ignore "FlightInfo.ordered = true". If an + * ordering is important for an application, an application must + * choose one of them: + * + * * An application requires that all clients must read data in + * returned endpoints order. + * * An application must return the all data in a single endpoint. + */ + repeated FlightEndpoint endpoint = 3; + + // Set these to -1 if unknown. + int64 total_records = 4; + int64 total_bytes = 5; + + /* + * FlightEndpoints are in the same order as the data. + */ + bool ordered = 6; + + /* + * Application-defined metadata. + * + * There is no inherent or required relationship between this + * and the app_metadata fields in the FlightEndpoints or resulting + * FlightData messages. Since this metadata is application-defined, + * a given application could define there to be a relationship, + * but there is none required by the spec. 
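+ *
+ * For example (an application-invented convention, not part of the
+ * spec), an application could agree to carry a UTF-8 query tag here and
+ * read it with the protoc-generated accessor:
+ *
+ *   String tag = flightInfoProto.getAppMetadata().toStringUtf8();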
+ */ + bytes app_metadata = 7; +} + +/* + * The information to process a long-running query. + */ +message PollInfo { + /* + * The currently available results. + * + * If "flight_descriptor" is not specified, the query is complete + * and "info" specifies all results. Otherwise, "info" contains + * partial query results. + * + * Note that each PollInfo response contains a complete + * FlightInfo (not just the delta between the previous and current + * FlightInfo). + * + * Subsequent PollInfo responses may only append new endpoints to + * info. + * + * Clients can begin fetching results via DoGet(Ticket) with the + * ticket in the info before the query is + * completed. FlightInfo.ordered is also valid. + */ + FlightInfo info = 1; + + /* + * The descriptor the client should use on the next try. + * If unset, the query is complete. + */ + FlightDescriptor flight_descriptor = 2; + + /* + * Query progress. If known, must be in [0.0, 1.0] but need not be + * monotonic or nondecreasing. If unknown, do not set. + */ + optional double progress = 3; + + /* + * Expiration time for this request. After this passes, the server + * might not accept the retry descriptor anymore (and the query may + * be cancelled). This may be updated on a call to PollFlightInfo. + */ + google.protobuf.Timestamp expiration_time = 4; +} + +/* + * The request of the CancelFlightInfo action. + * + * The request should be stored in Action.body. + */ +message CancelFlightInfoRequest { + FlightInfo info = 1; +} + +/* + * The result of a cancel operation. + * + * This is used by CancelFlightInfoResult.status. + */ +enum CancelStatus { + // The cancellation status is unknown. Servers should avoid using + // this value (send a NOT_FOUND error if the requested query is + // not known). Clients can retry the request. + CANCEL_STATUS_UNSPECIFIED = 0; + // The cancellation request is complete. Subsequent requests with + // the same payload may return CANCELLED or a NOT_FOUND error. + CANCEL_STATUS_CANCELLED = 1; + // The cancellation request is in progress. The client may retry + // the cancellation request. + CANCEL_STATUS_CANCELLING = 2; + // The query is not cancellable. The client should not retry the + // cancellation request. + CANCEL_STATUS_NOT_CANCELLABLE = 3; +} + +/* + * The result of the CancelFlightInfo action. + * + * The result should be stored in Result.body. + */ +message CancelFlightInfoResult { + CancelStatus status = 1; +} + +/* + * An opaque identifier that the service can use to retrieve a particular + * portion of a stream. + * + * Tickets are meant to be single use. It is an error/application-defined + * behavior to reuse a ticket. + */ +message Ticket { + bytes ticket = 1; +} + +/* + * A location where a Flight service will accept retrieval of a particular + * stream given a ticket. + */ +message Location { + string uri = 1; +} + +/* + * A particular stream or split associated with a flight. + */ +message FlightEndpoint { + + /* + * Token used to retrieve this stream. + */ + Ticket ticket = 1; + + /* + * A list of URIs where this ticket can be redeemed via DoGet(). + * + * If the list is empty, the expectation is that the ticket can only + * be redeemed on the current service where the ticket was + * generated. + * + * If the list is not empty, the expectation is that the ticket can be + * redeemed at any of the locations, and that the data returned will be + * equivalent. In this case, the ticket may only be redeemed at one of the + * given locations, and not (necessarily) on the current service. 
If one + * of the given locations is "arrow-flight-reuse-connection://?", the + * client may redeem the ticket on the service where the ticket was + * generated (i.e., the same as above), in addition to the other + * locations. (This URI was chosen to maximize compatibility, as 'scheme:' + * or 'scheme://' are not accepted by Java's java.net.URI.) + * + * In other words, an application can use multiple locations to + * represent redundant and/or load balanced services. + */ + repeated Location location = 2; + + /* + * Expiration time of this stream. If present, clients may assume + * they can retry DoGet requests. Otherwise, it is + * application-defined whether DoGet requests may be retried. + */ + google.protobuf.Timestamp expiration_time = 3; + + /* + * Application-defined metadata. + * + * There is no inherent or required relationship between this + * and the app_metadata fields in the FlightInfo or resulting + * FlightData messages. Since this metadata is application-defined, + * a given application could define there to be a relationship, + * but there is none required by the spec. + */ + bytes app_metadata = 4; +} + +/* + * The request of the RenewFlightEndpoint action. + * + * The request should be stored in Action.body. + */ +message RenewFlightEndpointRequest { + FlightEndpoint endpoint = 1; +} + +/* + * A batch of Arrow data as part of a stream of batches. + */ +message FlightData { + + /* + * The descriptor of the data. This is only relevant when a client is + * starting a new DoPut stream. + */ + FlightDescriptor flight_descriptor = 1; + + /* + * Header for message data as described in Message.fbs::Message. + */ + bytes data_header = 2; + + /* + * Application-defined metadata. + */ + bytes app_metadata = 3; + + /* + * The actual batch of Arrow data. Preferably handled with minimal-copies + * coming last in the definition to help with sidecar patterns (it is + * expected that some implementations will fetch this field off the wire + * with specialized code to avoid extra memory copies). + */ + bytes data_body = 1000; +} + +/** + * The response message associated with the submission of a DoPut. + */ +message PutResult { + bytes app_metadata = 1; +} + +/* + * EXPERIMENTAL: Union of possible value types for a Session Option to be set to. + * + * By convention, an attempt to set a valueless SessionOptionValue should + * attempt to unset or clear the named option value on the server. + */ +message SessionOptionValue { + message StringListValue { + repeated string values = 1; + } + + oneof option_value { + string string_value = 1; + bool bool_value = 2; + sfixed64 int64_value = 3; + double double_value = 4; + StringListValue string_list_value = 5; + } +} + +/* + * EXPERIMENTAL: A request to set session options for an existing or new (implicit) + * server session. + * + * Sessions are persisted and referenced via a transport-level state management, typically + * RFC 6265 HTTP cookies when using an HTTP transport. The suggested cookie name or state + * context key is 'arrow_flight_session_id', although implementations may freely choose their + * own name. + * + * Session creation (if one does not already exist) is implied by this RPC request, however + * server implementations may choose to initiate a session that also contains client-provided + * session options at any other time, e.g. on authentication, or when any other call is made + * and the server wishes to use a session to persist any state (or lack thereof). 
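+ *
+ * A minimal request built with the protoc-generated classes might look
+ * like this (a sketch; the option name "catalog" and value "main" are
+ * arbitrary illustrations, not names reserved by the protocol):
+ *
+ *   SetSessionOptionsRequest request = SetSessionOptionsRequest.newBuilder()
+ *       .putSessionOptions("catalog", SessionOptionValue.newBuilder()
+ *           .setStringValue("main")
+ *           .build())
+ *       .build();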
+ */
+message SetSessionOptionsRequest {
+ map<string, SessionOptionValue> session_options = 1;
+}
+
+/*
+ * EXPERIMENTAL: The results (individually) of setting a set of session options.
+ *
+ * Option names should only be present in the response if they were not successfully
+ * set on the server; that is, a response without an Error for a name provided in the
+ * SetSessionOptionsRequest implies that the named option value was set successfully.
+ */
+message SetSessionOptionsResult {
+ enum ErrorValue {
+ // Protobuf deserialization fallback value: The status is unknown or unrecognized.
+ // Servers should avoid using this value. The request may be retried by the client.
+ UNSPECIFIED = 0;
+ // The given session option name is invalid.
+ INVALID_NAME = 1;
+ // The session option value or type is invalid.
+ INVALID_VALUE = 2;
+ // The session option cannot be set.
+ ERROR = 3;
+ }
+
+ message Error {
+ ErrorValue value = 1;
+ }
+
+ map<string, Error> errors = 1;
+}
+
+/*
+ * EXPERIMENTAL: A request to access the session options for the current server session.
+ *
+ * The existing session is referenced via a cookie header or similar (see
+ * SetSessionOptionsRequest above); it is an error to make this request with a missing,
+ * invalid, or expired session cookie header or other implementation-defined session
+ * reference token.
+ */
+message GetSessionOptionsRequest {
+}
+
+/*
+ * EXPERIMENTAL: The result containing the current server session options.
+ */
+message GetSessionOptionsResult {
+ map<string, SessionOptionValue> session_options = 1;
+}
+
+/*
+ * Request message for the "Close Session" action.
+ *
+ * The existing session is referenced via a cookie header.
+ */
+message CloseSessionRequest {
+}
+
+/*
+ * The result of closing a session.
+ */
+message CloseSessionResult {
+ enum Status {
+ // Protobuf deserialization fallback value: The session close status is unknown or
+ // not recognized. Servers should avoid using this value (send a NOT_FOUND error if
+ // the requested session is not known or expired). Clients can retry the request.
+ UNSPECIFIED = 0;
+ // The session close request is complete. Subsequent requests with
+ // the same session produce a NOT_FOUND error.
+ CLOSED = 1;
+ // The session close request is in progress. The client may retry
+ // the close request.
+ CLOSING = 2;
+ // The session is not closeable. The client should not retry the
+ // close request.
+ NOT_CLOSEABLE = 3;
+ }
+
+ Status status = 1;
+}
diff --git a/arrow-format/FlightSql.proto b/arrow-format/FlightSql.proto
new file mode 100644
index 000000000..ef1ae7513
--- /dev/null
+++ b/arrow-format/FlightSql.proto
@@ -0,0 +1,1925 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0 + *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+syntax = "proto3";
+import "google/protobuf/descriptor.proto";
+
+option java_package = "org.apache.arrow.flight.sql.impl";
+option go_package = "github.com/apache/arrow-go/arrow/flight/gen/flight";
+package arrow.flight.protocol.sql;
+
+/*
+ * Represents a metadata request. Used in the command member of FlightDescriptor
+ * for the following RPC calls:
+ * - GetSchema: return the Arrow schema of the query.
+ * - GetFlightInfo: execute the metadata request.
+ *
+ * The returned Arrow schema will be:
+ * <
+ *   info_name: uint32 not null,
+ *   value: dense_union<
+ *     string_value: utf8,
+ *     bool_value: bool,
+ *     bigint_value: int64,
+ *     int32_bitmask: int32,
+ *     string_list: list<string_data: utf8>
+ *     int32_to_int32_list_map: map<int32, list<int32>>
+ *   >
+ * >
+ * where there is one row per requested piece of metadata information.
+ */
+message CommandGetSqlInfo {
+
+ /*
+ * Values are modelled after ODBC's SQLGetInfo() function. This information is intended to provide
+ * Flight SQL clients with basic information about the supported SQL syntax and SQL functions.
+ * More information types can be added in future releases,
+ * e.g. more SQL syntax support types, scalar function support, type conversion support, etc.
+ *
+ * Note that the set of metadata may expand.
+ *
+ * Initially, Flight SQL will support the following information types:
+ * - Server Information - Range [0-500)
+ * - Syntax Information - Range [500-1000)
+ * Range [0-10,000) is reserved for defaults (see SqlInfo enum for default options).
+ * Custom options should start at 10,000.
+ *
+ * If omitted, then all metadata will be retrieved.
+ * Flight SQL Servers may choose to include additional metadata above and beyond the specified set, however they must
+ * at least return the specified set. IDs ranging from 0 to 10,000 (exclusive) are reserved for future use.
+ * If additional metadata is included, the metadata IDs should start from 10,000.
+ */
+ repeated uint32 info = 1;
+}
+
+// Options for CommandGetSqlInfo.
+enum SqlInfo {
+
+ // Server Information [0-500): Provides basic information about the Flight SQL Server.
+
+ // Retrieves a UTF-8 string with the name of the Flight SQL Server.
+ FLIGHT_SQL_SERVER_NAME = 0;
+
+ // Retrieves a UTF-8 string with the native version of the Flight SQL Server.
+ FLIGHT_SQL_SERVER_VERSION = 1;
+
+ // Retrieves a UTF-8 string with the Arrow format version of the Flight SQL Server.
+ FLIGHT_SQL_SERVER_ARROW_VERSION = 2;
+
+ /*
+ * Retrieves a boolean value indicating whether the Flight SQL Server is read only.
+ *
+ * Returns:
+ * - false: if read-write
+ * - true: if read only
+ */
+ FLIGHT_SQL_SERVER_READ_ONLY = 3;
+
+ /*
+ * Retrieves a boolean value indicating whether the Flight SQL Server supports executing
+ * SQL queries.
+ *
+ * Note that the absence of this info (as opposed to a false value) does not necessarily
+ * mean that SQL is not supported, as this property was not originally defined.
+ */
+ FLIGHT_SQL_SERVER_SQL = 4;
+
+ /*
+ * Retrieves a boolean value indicating whether the Flight SQL Server supports executing
+ * Substrait plans.
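+ *
+ * For instance, a Java client might probe this flag before submitting a
+ * Substrait plan (a sketch, assuming this repository's
+ * org.apache.arrow.flight.sql.FlightSqlClient and the protoc-generated
+ * SqlInfo enum):
+ *
+ *   FlightInfo info = sqlClient.getSqlInfo(SqlInfo.FLIGHT_SQL_SERVER_SUBSTRAIT);
+ *   // then fetch the endpoint(s) and decode the info_name/value rows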
+ */ + FLIGHT_SQL_SERVER_SUBSTRAIT = 5; + + /* + * Retrieves a string value indicating the minimum supported Substrait version, or null + * if Substrait is not supported. + */ + FLIGHT_SQL_SERVER_SUBSTRAIT_MIN_VERSION = 6; + + /* + * Retrieves a string value indicating the maximum supported Substrait version, or null + * if Substrait is not supported. + */ + FLIGHT_SQL_SERVER_SUBSTRAIT_MAX_VERSION = 7; + + /* + * Retrieves an int32 indicating whether the Flight SQL Server supports the + * BeginTransaction/EndTransaction/BeginSavepoint/EndSavepoint actions. + * + * Even if this is not supported, the database may still support explicit "BEGIN + * TRANSACTION"/"COMMIT" SQL statements (see SQL_TRANSACTIONS_SUPPORTED); this property + * is only about whether the server implements the Flight SQL API endpoints. + * + * The possible values are listed in `SqlSupportedTransaction`. + */ + FLIGHT_SQL_SERVER_TRANSACTION = 8; + + /* + * Retrieves a boolean value indicating whether the Flight SQL Server supports explicit + * query cancellation (the CancelQuery action). + */ + FLIGHT_SQL_SERVER_CANCEL = 9; + + /* + * Retrieves a boolean value indicating whether the Flight SQL Server supports executing + * bulk ingestion. + */ + FLIGHT_SQL_SERVER_BULK_INGESTION = 10; + + /* + * Retrieves a boolean value indicating whether transactions are supported for bulk ingestion. If not, invoking + * the method commit in the context of a bulk ingestion is a noop, and the isolation level is + * `arrow.flight.protocol.sql.SqlTransactionIsolationLevel.TRANSACTION_NONE`. + * + * Returns: + * - false: if bulk ingestion transactions are unsupported; + * - true: if bulk ingestion transactions are supported. + */ + FLIGHT_SQL_SERVER_INGEST_TRANSACTIONS_SUPPORTED = 11; + + /* + * Retrieves an int32 indicating the timeout (in milliseconds) for prepared statement handles. + * + * If 0, there is no timeout. Servers should reset the timeout when the handle is used in a command. + */ + FLIGHT_SQL_SERVER_STATEMENT_TIMEOUT = 100; + + /* + * Retrieves an int32 indicating the timeout (in milliseconds) for transactions, since transactions are not tied to a connection. + * + * If 0, there is no timeout. Servers should reset the timeout when the handle is used in a command. + */ + FLIGHT_SQL_SERVER_TRANSACTION_TIMEOUT = 101; + + // SQL Syntax Information [500-1000): provides information about SQL syntax supported by the Flight SQL Server. + + /* + * Retrieves a boolean value indicating whether the Flight SQL Server supports CREATE and DROP of catalogs. + * + * Returns: + * - false: if it doesn't support CREATE and DROP of catalogs. + * - true: if it supports CREATE and DROP of catalogs. + */ + SQL_DDL_CATALOG = 500; + + /* + * Retrieves a boolean value indicating whether the Flight SQL Server supports CREATE and DROP of schemas. + * + * Returns: + * - false: if it doesn't support CREATE and DROP of schemas. + * - true: if it supports CREATE and DROP of schemas. + */ + SQL_DDL_SCHEMA = 501; + + /* + * Indicates whether the Flight SQL Server supports CREATE and DROP of tables. + * + * Returns: + * - false: if it doesn't support CREATE and DROP of tables. + * - true: if it supports CREATE and DROP of tables. + */ + SQL_DDL_TABLE = 502; + + /* + * Retrieves a int32 ordinal representing the case sensitivity of catalog, table, schema and table names. + * + * The possible values are listed in `arrow.flight.protocol.sql.SqlSupportedCaseSensitivity`. 
+ */ + SQL_IDENTIFIER_CASE = 503; + + // Retrieves a UTF-8 string with the supported character(s) used to surround a delimited identifier. + SQL_IDENTIFIER_QUOTE_CHAR = 504; + + /* + * Retrieves a int32 describing the case sensitivity of quoted identifiers. + * + * The possible values are listed in `arrow.flight.protocol.sql.SqlSupportedCaseSensitivity`. + */ + SQL_QUOTED_IDENTIFIER_CASE = 505; + + /* + * Retrieves a boolean value indicating whether all tables are selectable. + * + * Returns: + * - false: if not all tables are selectable or if none are; + * - true: if all tables are selectable. + */ + SQL_ALL_TABLES_ARE_SELECTABLE = 506; + + /* + * Retrieves the null ordering. + * + * Returns a int32 ordinal for the null ordering being used, as described in + * `arrow.flight.protocol.sql.SqlNullOrdering`. + */ + SQL_NULL_ORDERING = 507; + + // Retrieves a UTF-8 string list with values of the supported keywords. + SQL_KEYWORDS = 508; + + // Retrieves a UTF-8 string list with values of the supported numeric functions. + SQL_NUMERIC_FUNCTIONS = 509; + + // Retrieves a UTF-8 string list with values of the supported string functions. + SQL_STRING_FUNCTIONS = 510; + + // Retrieves a UTF-8 string list with values of the supported system functions. + SQL_SYSTEM_FUNCTIONS = 511; + + // Retrieves a UTF-8 string list with values of the supported datetime functions. + SQL_DATETIME_FUNCTIONS = 512; + + /* + * Retrieves the UTF-8 string that can be used to escape wildcard characters. + * This is the string that can be used to escape '_' or '%' in the catalog search parameters that are a pattern + * (and therefore use one of the wildcard characters). + * The '_' character represents any single character; the '%' character represents any sequence of zero or more + * characters. + */ + SQL_SEARCH_STRING_ESCAPE = 513; + + /* + * Retrieves a UTF-8 string with all the "extra" characters that can be used in unquoted identifier names + * (those beyond a-z, A-Z, 0-9 and _). + */ + SQL_EXTRA_NAME_CHARACTERS = 514; + + /* + * Retrieves a boolean value indicating whether column aliasing is supported. + * If so, the SQL AS clause can be used to provide names for computed columns or to provide alias names for columns + * as required. + * + * Returns: + * - false: if column aliasing is unsupported; + * - true: if column aliasing is supported. + */ + SQL_SUPPORTS_COLUMN_ALIASING = 515; + + /* + * Retrieves a boolean value indicating whether concatenations between null and non-null values being + * null are supported. + * + * - Returns: + * - false: if concatenations between null and non-null values being null are unsupported; + * - true: if concatenations between null and non-null values being null are supported. + */ + SQL_NULL_PLUS_NULL_IS_NULL = 516; + + /* + * Retrieves a map where the key is the type to convert from and the value is a list with the types to convert to, + * indicating the supported conversions. Each key and each item on the list value is a value to a predefined type on + * SqlSupportsConvert enum. + * The returned map will be: map> + */ + SQL_SUPPORTS_CONVERT = 517; + + /* + * Retrieves a boolean value indicating whether, when table correlation names are supported, + * they are restricted to being different from the names of the tables. + * + * Returns: + * - false: if table correlation names are unsupported; + * - true: if table correlation names are supported. 
+ */ + SQL_SUPPORTS_TABLE_CORRELATION_NAMES = 518; + + /* + * Retrieves a boolean value indicating whether, when table correlation names are supported, + * they are restricted to being different from the names of the tables. + * + * Returns: + * - false: if different table correlation names are unsupported; + * - true: if different table correlation names are supported + */ + SQL_SUPPORTS_DIFFERENT_TABLE_CORRELATION_NAMES = 519; + + /* + * Retrieves a boolean value indicating whether expressions in ORDER BY lists are supported. + * + * Returns: + * - false: if expressions in ORDER BY are unsupported; + * - true: if expressions in ORDER BY are supported; + */ + SQL_SUPPORTS_EXPRESSIONS_IN_ORDER_BY = 520; + + /* + * Retrieves a boolean value indicating whether using a column that is not in the SELECT statement in a GROUP BY + * clause is supported. + * + * Returns: + * - false: if using a column that is not in the SELECT statement in a GROUP BY clause is unsupported; + * - true: if using a column that is not in the SELECT statement in a GROUP BY clause is supported. + */ + SQL_SUPPORTS_ORDER_BY_UNRELATED = 521; + + /* + * Retrieves the supported GROUP BY commands; + * + * Returns an int32 bitmask value representing the supported commands. + * The returned bitmask should be parsed in order to retrieve the supported commands. + * + * For instance: + * - return 0 (\b0) => [] (GROUP BY is unsupported); + * - return 1 (\b1) => [SQL_GROUP_BY_UNRELATED]; + * - return 2 (\b10) => [SQL_GROUP_BY_BEYOND_SELECT]; + * - return 3 (\b11) => [SQL_GROUP_BY_UNRELATED, SQL_GROUP_BY_BEYOND_SELECT]. + * Valid GROUP BY types are described under `arrow.flight.protocol.sql.SqlSupportedGroupBy`. + */ + SQL_SUPPORTED_GROUP_BY = 522; + + /* + * Retrieves a boolean value indicating whether specifying a LIKE escape clause is supported. + * + * Returns: + * - false: if specifying a LIKE escape clause is unsupported; + * - true: if specifying a LIKE escape clause is supported. + */ + SQL_SUPPORTS_LIKE_ESCAPE_CLAUSE = 523; + + /* + * Retrieves a boolean value indicating whether columns may be defined as non-nullable. + * + * Returns: + * - false: if columns cannot be defined as non-nullable; + * - true: if columns may be defined as non-nullable. + */ + SQL_SUPPORTS_NON_NULLABLE_COLUMNS = 524; + + /* + * Retrieves the supported SQL grammar level as per the ODBC specification. + * + * Returns an int32 bitmask value representing the supported SQL grammar level. + * The returned bitmask should be parsed in order to retrieve the supported grammar levels. + * + * For instance: + * - return 0 (\b0) => [] (SQL grammar is unsupported); + * - return 1 (\b1) => [SQL_MINIMUM_GRAMMAR]; + * - return 2 (\b10) => [SQL_CORE_GRAMMAR]; + * - return 3 (\b11) => [SQL_MINIMUM_GRAMMAR, SQL_CORE_GRAMMAR]; + * - return 4 (\b100) => [SQL_EXTENDED_GRAMMAR]; + * - return 5 (\b101) => [SQL_MINIMUM_GRAMMAR, SQL_EXTENDED_GRAMMAR]; + * - return 6 (\b110) => [SQL_CORE_GRAMMAR, SQL_EXTENDED_GRAMMAR]; + * - return 7 (\b111) => [SQL_MINIMUM_GRAMMAR, SQL_CORE_GRAMMAR, SQL_EXTENDED_GRAMMAR]. + * Valid SQL grammar levels are described under `arrow.flight.protocol.sql.SupportedSqlGrammar`. + */ + SQL_SUPPORTED_GRAMMAR = 525; + + /* + * Retrieves the supported ANSI92 SQL grammar level. + * + * Returns an int32 bitmask value representing the supported ANSI92 SQL grammar level. + * The returned bitmask should be parsed in order to retrieve the supported commands. 
+ *
+ * For instance:
+ * - return 0 (\b0) => [] (ANSI92 SQL grammar is unsupported);
+ * - return 1 (\b1) => [ANSI92_ENTRY_SQL];
+ * - return 2 (\b10) => [ANSI92_INTERMEDIATE_SQL];
+ * - return 3 (\b11) => [ANSI92_ENTRY_SQL, ANSI92_INTERMEDIATE_SQL];
+ * - return 4 (\b100) => [ANSI92_FULL_SQL];
+ * - return 5 (\b101) => [ANSI92_ENTRY_SQL, ANSI92_FULL_SQL];
+ * - return 6 (\b110) => [ANSI92_INTERMEDIATE_SQL, ANSI92_FULL_SQL];
+ * - return 7 (\b111) => [ANSI92_ENTRY_SQL, ANSI92_INTERMEDIATE_SQL, ANSI92_FULL_SQL].
+ * Valid ANSI92 SQL grammar levels are described under `arrow.flight.protocol.sql.SupportedAnsi92SqlGrammarLevel`.
+ */
+ SQL_ANSI92_SUPPORTED_LEVEL = 526;
+
+ /*
+ * Retrieves a boolean value indicating whether the SQL Integrity Enhancement Facility is supported.
+ *
+ * Returns:
+ * - false: if the SQL Integrity Enhancement Facility is not supported;
+ * - true: if the SQL Integrity Enhancement Facility is supported.
+ */
+ SQL_SUPPORTS_INTEGRITY_ENHANCEMENT_FACILITY = 527;
+
+ /*
+ * Retrieves the support level for SQL OUTER JOINs.
+ *
+ * Returns an int32 ordinal for the SQL OUTER JOIN support level, as described in
+ * `arrow.flight.protocol.sql.SqlOuterJoinsSupportLevel`.
+ */
+ SQL_OUTER_JOINS_SUPPORT_LEVEL = 528;
+
+ // Retrieves a UTF-8 string with the preferred term for "schema".
+ SQL_SCHEMA_TERM = 529;
+
+ // Retrieves a UTF-8 string with the preferred term for "procedure".
+ SQL_PROCEDURE_TERM = 530;
+
+ /*
+ * Retrieves a UTF-8 string with the preferred term for "catalog".
+ * If an empty string is returned, it is assumed that the server does NOT support catalogs.
+ */
+ SQL_CATALOG_TERM = 531;
+
+ /*
+ * Retrieves a boolean value indicating whether a catalog appears at the start of a fully qualified table name.
+ *
+ * - false: if a catalog does not appear at the start of a fully qualified table name;
+ * - true: if a catalog appears at the start of a fully qualified table name.
+ */
+ SQL_CATALOG_AT_START = 532;
+
+ /*
+ * Retrieves the supported actions for a SQL schema.
+ *
+ * Returns an int32 bitmask value representing the supported actions for a SQL schema.
+ * The returned bitmask should be parsed in order to retrieve the supported actions for a SQL schema.
+ *
+ * For instance:
+ * - return 0 (\b0) => [] (no supported actions for SQL schema);
+ * - return 1 (\b1) => [SQL_ELEMENT_IN_PROCEDURE_CALLS];
+ * - return 2 (\b10) => [SQL_ELEMENT_IN_INDEX_DEFINITIONS];
+ * - return 3 (\b11) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS];
+ * - return 4 (\b100) => [SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS];
+ * - return 5 (\b101) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS];
+ * - return 6 (\b110) => [SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS];
+ * - return 7 (\b111) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS].
+ * Valid actions for a SQL schema are described under `arrow.flight.protocol.sql.SqlSupportedElementActions`.
+ */
+ SQL_SCHEMAS_SUPPORTED_ACTIONS = 533;
+
+ /*
+ * Retrieves the supported actions for a SQL catalog.
+ *
+ * Returns an int32 bitmask value representing the supported actions for a SQL catalog.
+ * The returned bitmask should be parsed in order to retrieve the supported actions for a SQL catalog.
+ * + * For instance: + * - return 0 (\b0) => [] (no supported actions for SQL catalog); + * - return 1 (\b1) => [SQL_ELEMENT_IN_PROCEDURE_CALLS]; + * - return 2 (\b10) => [SQL_ELEMENT_IN_INDEX_DEFINITIONS]; + * - return 3 (\b11) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS]; + * - return 4 (\b100) => [SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; + * - return 5 (\b101) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; + * - return 6 (\b110) => [SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; + * - return 7 (\b111) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]. + * Valid actions for a SQL catalog are described under `arrow.flight.protocol.sql.SqlSupportedElementActions`. + */ + SQL_CATALOGS_SUPPORTED_ACTIONS = 534; + + /* + * Retrieves the supported SQL positioned commands. + * + * Returns an int32 bitmask value representing the supported SQL positioned commands. + * The returned bitmask should be parsed in order to retrieve the supported SQL positioned commands. + * + * For instance: + * - return 0 (\b0) => [] (no supported SQL positioned commands); + * - return 1 (\b1) => [SQL_POSITIONED_DELETE]; + * - return 2 (\b10) => [SQL_POSITIONED_UPDATE]; + * - return 3 (\b11) => [SQL_POSITIONED_DELETE, SQL_POSITIONED_UPDATE]. + * Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlSupportedPositionedCommands`. + */ + SQL_SUPPORTED_POSITIONED_COMMANDS = 535; + + /* + * Retrieves a boolean value indicating whether SELECT FOR UPDATE statements are supported. + * + * Returns: + * - false: if SELECT FOR UPDATE statements are unsupported; + * - true: if SELECT FOR UPDATE statements are supported. + */ + SQL_SELECT_FOR_UPDATE_SUPPORTED = 536; + + /* + * Retrieves a boolean value indicating whether stored procedure calls that use the stored procedure escape syntax + * are supported. + * + * Returns: + * - false: if stored procedure calls that use the stored procedure escape syntax are unsupported; + * - true: if stored procedure calls that use the stored procedure escape syntax are supported. + */ + SQL_STORED_PROCEDURES_SUPPORTED = 537; + + /* + * Retrieves the supported SQL subqueries. + * + * Returns an int32 bitmask value representing the supported SQL subqueries. + * The returned bitmask should be parsed in order to retrieve the supported SQL subqueries. 
+ * + * For instance: + * - return 0 (\b0) => [] (no supported SQL subqueries); + * - return 1 (\b1) => [SQL_SUBQUERIES_IN_COMPARISONS]; + * - return 2 (\b10) => [SQL_SUBQUERIES_IN_EXISTS]; + * - return 3 (\b11) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS]; + * - return 4 (\b100) => [SQL_SUBQUERIES_IN_INS]; + * - return 5 (\b101) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_INS]; + * - return 6 (\b110) => [SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_EXISTS]; + * - return 7 (\b111) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS]; + * - return 8 (\b1000) => [SQL_SUBQUERIES_IN_QUANTIFIEDS]; + * - return 9 (\b1001) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + * - return 10 (\b1010) => [SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + * - return 11 (\b1011) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + * - return 12 (\b1100) => [SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + * - return 13 (\b1101) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + * - return 14 (\b1110) => [SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + * - return 15 (\b1111) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + * - ... + * Valid SQL subqueries are described under `arrow.flight.protocol.sql.SqlSupportedSubqueries`. + */ + SQL_SUPPORTED_SUBQUERIES = 538; + + /* + * Retrieves a boolean value indicating whether correlated subqueries are supported. + * + * Returns: + * - false: if correlated subqueries are unsupported; + * - true: if correlated subqueries are supported. + */ + SQL_CORRELATED_SUBQUERIES_SUPPORTED = 539; + + /* + * Retrieves the supported SQL UNIONs. + * + * Returns an int32 bitmask value representing the supported SQL UNIONs. + * The returned bitmask should be parsed in order to retrieve the supported SQL UNIONs. + * + * For instance: + * - return 0 (\b0) => [] (no supported SQL positioned commands); + * - return 1 (\b1) => [SQL_UNION]; + * - return 2 (\b10) => [SQL_UNION_ALL]; + * - return 3 (\b11) => [SQL_UNION, SQL_UNION_ALL]. + * Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlSupportedUnions`. + */ + SQL_SUPPORTED_UNIONS = 540; + + // Retrieves a int64 value representing the maximum number of hex characters allowed in an inline binary literal. + SQL_MAX_BINARY_LITERAL_LENGTH = 541; + + // Retrieves a int64 value representing the maximum number of characters allowed for a character literal. + SQL_MAX_CHAR_LITERAL_LENGTH = 542; + + // Retrieves a int64 value representing the maximum number of characters allowed for a column name. + SQL_MAX_COLUMN_NAME_LENGTH = 543; + + // Retrieves a int64 value representing the maximum number of columns allowed in a GROUP BY clause. + SQL_MAX_COLUMNS_IN_GROUP_BY = 544; + + // Retrieves a int64 value representing the maximum number of columns allowed in an index. + SQL_MAX_COLUMNS_IN_INDEX = 545; + + // Retrieves a int64 value representing the maximum number of columns allowed in an ORDER BY clause. + SQL_MAX_COLUMNS_IN_ORDER_BY = 546; + + // Retrieves a int64 value representing the maximum number of columns allowed in a SELECT list. + SQL_MAX_COLUMNS_IN_SELECT = 547; + + // Retrieves a int64 value representing the maximum number of columns allowed in a table. 
+ SQL_MAX_COLUMNS_IN_TABLE = 548; + + // Retrieves a int64 value representing the maximum number of concurrent connections possible. + SQL_MAX_CONNECTIONS = 549; + + // Retrieves a int64 value the maximum number of characters allowed in a cursor name. + SQL_MAX_CURSOR_NAME_LENGTH = 550; + + /* + * Retrieves a int64 value representing the maximum number of bytes allowed for an index, + * including all of the parts of the index. + */ + SQL_MAX_INDEX_LENGTH = 551; + + // Retrieves a int64 value representing the maximum number of characters allowed in a schema name. + SQL_DB_SCHEMA_NAME_LENGTH = 552; + + // Retrieves a int64 value representing the maximum number of characters allowed in a procedure name. + SQL_MAX_PROCEDURE_NAME_LENGTH = 553; + + // Retrieves a int64 value representing the maximum number of characters allowed in a catalog name. + SQL_MAX_CATALOG_NAME_LENGTH = 554; + + // Retrieves a int64 value representing the maximum number of bytes allowed in a single row. + SQL_MAX_ROW_SIZE = 555; + + /* + * Retrieves a boolean indicating whether the return value for the JDBC method getMaxRowSize includes the SQL + * data types LONGVARCHAR and LONGVARBINARY. + * + * Returns: + * - false: if return value for the JDBC method getMaxRowSize does + * not include the SQL data types LONGVARCHAR and LONGVARBINARY; + * - true: if return value for the JDBC method getMaxRowSize includes + * the SQL data types LONGVARCHAR and LONGVARBINARY. + */ + SQL_MAX_ROW_SIZE_INCLUDES_BLOBS = 556; + + /* + * Retrieves a int64 value representing the maximum number of characters allowed for an SQL statement; + * a result of 0 (zero) means that there is no limit or the limit is not known. + */ + SQL_MAX_STATEMENT_LENGTH = 557; + + // Retrieves a int64 value representing the maximum number of active statements that can be open at the same time. + SQL_MAX_STATEMENTS = 558; + + // Retrieves a int64 value representing the maximum number of characters allowed in a table name. + SQL_MAX_TABLE_NAME_LENGTH = 559; + + // Retrieves a int64 value representing the maximum number of tables allowed in a SELECT statement. + SQL_MAX_TABLES_IN_SELECT = 560; + + // Retrieves a int64 value representing the maximum number of characters allowed in a user name. + SQL_MAX_USERNAME_LENGTH = 561; + + /* + * Retrieves this database's default transaction isolation level as described in + * `arrow.flight.protocol.sql.SqlTransactionIsolationLevel`. + * + * Returns a int32 ordinal for the SQL transaction isolation level. + */ + SQL_DEFAULT_TRANSACTION_ISOLATION = 562; + + /* + * Retrieves a boolean value indicating whether transactions are supported. If not, invoking the method commit is a + * noop, and the isolation level is `arrow.flight.protocol.sql.SqlTransactionIsolationLevel.TRANSACTION_NONE`. + * + * Returns: + * - false: if transactions are unsupported; + * - true: if transactions are supported. + */ + SQL_TRANSACTIONS_SUPPORTED = 563; + + /* + * Retrieves the supported transactions isolation levels. + * + * Returns an int32 bitmask value representing the supported transactions isolation levels. + * The returned bitmask should be parsed in order to retrieve the supported transactions isolation levels. 
+ * + * For instance: + * - return 0 (\b0) => [] (no supported SQL transactions isolation levels); + * - return 1 (\b1) => [SQL_TRANSACTION_NONE]; + * - return 2 (\b10) => [SQL_TRANSACTION_READ_UNCOMMITTED]; + * - return 3 (\b11) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED]; + * - return 4 (\b100) => [SQL_TRANSACTION_REPEATABLE_READ]; + * - return 5 (\b101) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ]; + * - return 6 (\b110) => [SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; + * - return 7 (\b111) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; + * - return 8 (\b1000) => [SQL_TRANSACTION_REPEATABLE_READ]; + * - return 9 (\b1001) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ]; + * - return 10 (\b1010) => [SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; + * - return 11 (\b1011) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; + * - return 12 (\b1100) => [SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; + * - return 13 (\b1101) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; + * - return 14 (\b1110) => [SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; + * - return 15 (\b1111) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; + * - return 16 (\b10000) => [SQL_TRANSACTION_SERIALIZABLE]; + * - ... + * Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlTransactionIsolationLevel`. + */ + SQL_SUPPORTED_TRANSACTIONS_ISOLATION_LEVELS = 564; + + /* + * Retrieves a boolean value indicating whether a data definition statement within a transaction forces + * the transaction to commit. + * + * Returns: + * - false: if a data definition statement within a transaction does not force the transaction to commit; + * - true: if a data definition statement within a transaction forces the transaction to commit. + */ + SQL_DATA_DEFINITION_CAUSES_TRANSACTION_COMMIT = 565; + + /* + * Retrieves a boolean value indicating whether a data definition statement within a transaction is ignored. + * + * Returns: + * - false: if a data definition statement within a transaction is taken into account; + * - true: a data definition statement within a transaction is ignored. + */ + SQL_DATA_DEFINITIONS_IN_TRANSACTIONS_IGNORED = 566; + + /* + * Retrieves an int32 bitmask value representing the supported result set types. + * The returned bitmask should be parsed in order to retrieve the supported result set types. + * + * For instance: + * - return 0 (\b0) => [] (no supported result set types); + * - return 1 (\b1) => [SQL_RESULT_SET_TYPE_UNSPECIFIED]; + * - return 2 (\b10) => [SQL_RESULT_SET_TYPE_FORWARD_ONLY]; + * - return 3 (\b11) => [SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_FORWARD_ONLY]; + * - return 4 (\b100) => [SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; + * - return 5 (\b101) => [SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; + * - return 6 (\b110) => [SQL_RESULT_SET_TYPE_FORWARD_ONLY, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; + * - return 7 (\b111) => [SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_FORWARD_ONLY, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; + * - return 8 (\b1000) => [SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE]; + * - ... 
+ * Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetType`.
+ */
+ SQL_SUPPORTED_RESULT_SET_TYPES = 567;
+
+ /*
+ * Returns an int32 bitmask value representing the concurrency types supported for
+ * `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_UNSPECIFIED`.
+ *
+ * For instance:
+ * - return 0 (\b0) => [] (no supported concurrency types for this result set type)
+ * - return 1 (\b1) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED]
+ * - return 2 (\b10) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY]
+ * - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY]
+ * - return 4 (\b100) => [SQL_RESULT_SET_CONCURRENCY_UPDATABLE]
+ * - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE]
+ * - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE]
+ * - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE]
+ * Valid concurrency types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`.
+ */
+ SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_UNSPECIFIED = 568;
+
+ /*
+ * Returns an int32 bitmask value representing the concurrency types supported for
+ * `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_FORWARD_ONLY`.
+ *
+ * For instance:
+ * - return 0 (\b0) => [] (no supported concurrency types for this result set type)
+ * - return 1 (\b1) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED]
+ * - return 2 (\b10) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY]
+ * - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY]
+ * - return 4 (\b100) => [SQL_RESULT_SET_CONCURRENCY_UPDATABLE]
+ * - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE]
+ * - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE]
+ * - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE]
+ * Valid concurrency types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`.
+ */
+ SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_FORWARD_ONLY = 569;
+
+ /*
+ * Returns an int32 bitmask value representing the concurrency types supported for
+ * `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE`.
+ *
+ * For instance:
+ * - return 0 (\b0) => [] (no supported concurrency types for this result set type)
+ * - return 1 (\b1) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED]
+ * - return 2 (\b10) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY]
+ * - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY]
+ * - return 4 (\b100) => [SQL_RESULT_SET_CONCURRENCY_UPDATABLE]
+ * - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE]
+ * - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE]
+ * - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE]
+ * Valid concurrency types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`.
+ */
+ SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_SCROLL_SENSITIVE = 570;
+
+ /*
+ * Returns an int32 bitmask value representing the concurrency types supported for
+ * `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE`.
+ *
+ * For instance:
+ * - return 0 (\b0) => [] (no supported concurrency types for this result set type)
+ * - return 1 (\b1) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED]
+ * - return 2 (\b10) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY]
+ * - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY]
+ * - return 4 (\b100) => [SQL_RESULT_SET_CONCURRENCY_UPDATABLE]
+ * - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE]
+ * - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE]
+ * - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE]
+ * Valid concurrency types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`.
+ */
+ SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_SCROLL_INSENSITIVE = 571;
+
+ /*
+ * Retrieves a boolean value indicating whether this database supports batch updates.
+ *
+ * Returns:
+ * - false: if this database does not support batch updates;
+ * - true: if this database supports batch updates.
+ */
+ SQL_BATCH_UPDATES_SUPPORTED = 572;
+
+ /*
+ * Retrieves a boolean value indicating whether this database supports savepoints.
+ *
+ * Returns:
+ * - false: if this database does not support savepoints;
+ * - true: if this database supports savepoints.
+ */
+ SQL_SAVEPOINTS_SUPPORTED = 573;
+
+ /*
+ * Retrieves a boolean value indicating whether named parameters are supported in callable statements.
+ *
+ * Returns:
+ * - false: if named parameters in callable statements are unsupported;
+ * - true: if named parameters in callable statements are supported.
+ */
+ SQL_NAMED_PARAMETERS_SUPPORTED = 574;
+
+ /*
+ * Retrieves a boolean value indicating whether updates made to a LOB are made on a copy or directly to the LOB.
+ *
+ * Returns:
+ * - false: if updates made to a LOB are made directly to the LOB;
+ * - true: if updates made to a LOB are made on a copy.
+ */
+ SQL_LOCATORS_UPDATE_COPY = 575;
+
+ /*
+ * Retrieves a boolean value indicating whether invoking user-defined or vendor functions
+ * using the stored procedure escape syntax is supported.
+ *
+ * Returns:
+ * - false: if invoking user-defined or vendor functions using the stored procedure escape syntax is unsupported;
+ * - true: if invoking user-defined or vendor functions using the stored procedure escape syntax is supported.
+ */
+ SQL_STORED_FUNCTIONS_USING_CALL_SYNTAX_SUPPORTED = 576;
+}
+
+// The level of support for Flight SQL transaction RPCs.
+enum SqlSupportedTransaction {
+ // Unknown/not indicated/no support
+ SQL_SUPPORTED_TRANSACTION_NONE = 0;
+ // Transactions, but not savepoints.
+ // A savepoint is a mark within a transaction that can be individually
+ // rolled back to. Not all databases support savepoints.
+ SQL_SUPPORTED_TRANSACTION_TRANSACTION = 1; + // Transactions and savepoints + SQL_SUPPORTED_TRANSACTION_SAVEPOINT = 2; +} + +enum SqlSupportedCaseSensitivity { + SQL_CASE_SENSITIVITY_UNKNOWN = 0; + SQL_CASE_SENSITIVITY_CASE_INSENSITIVE = 1; + SQL_CASE_SENSITIVITY_UPPERCASE = 2; + SQL_CASE_SENSITIVITY_LOWERCASE = 3; +} + +enum SqlNullOrdering { + SQL_NULLS_SORTED_HIGH = 0; + SQL_NULLS_SORTED_LOW = 1; + SQL_NULLS_SORTED_AT_START = 2; + SQL_NULLS_SORTED_AT_END = 3; +} + +enum SupportedSqlGrammar { + SQL_MINIMUM_GRAMMAR = 0; + SQL_CORE_GRAMMAR = 1; + SQL_EXTENDED_GRAMMAR = 2; +} + +enum SupportedAnsi92SqlGrammarLevel { + ANSI92_ENTRY_SQL = 0; + ANSI92_INTERMEDIATE_SQL = 1; + ANSI92_FULL_SQL = 2; +} + +enum SqlOuterJoinsSupportLevel { + SQL_JOINS_UNSUPPORTED = 0; + SQL_LIMITED_OUTER_JOINS = 1; + SQL_FULL_OUTER_JOINS = 2; +} + +enum SqlSupportedGroupBy { + SQL_GROUP_BY_UNRELATED = 0; + SQL_GROUP_BY_BEYOND_SELECT = 1; +} + +enum SqlSupportedElementActions { + SQL_ELEMENT_IN_PROCEDURE_CALLS = 0; + SQL_ELEMENT_IN_INDEX_DEFINITIONS = 1; + SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS = 2; +} + +enum SqlSupportedPositionedCommands { + SQL_POSITIONED_DELETE = 0; + SQL_POSITIONED_UPDATE = 1; +} + +enum SqlSupportedSubqueries { + SQL_SUBQUERIES_IN_COMPARISONS = 0; + SQL_SUBQUERIES_IN_EXISTS = 1; + SQL_SUBQUERIES_IN_INS = 2; + SQL_SUBQUERIES_IN_QUANTIFIEDS = 3; +} + +enum SqlSupportedUnions { + SQL_UNION = 0; + SQL_UNION_ALL = 1; +} + +enum SqlTransactionIsolationLevel { + SQL_TRANSACTION_NONE = 0; + SQL_TRANSACTION_READ_UNCOMMITTED = 1; + SQL_TRANSACTION_READ_COMMITTED = 2; + SQL_TRANSACTION_REPEATABLE_READ = 3; + SQL_TRANSACTION_SERIALIZABLE = 4; +} + +enum SqlSupportedTransactions { + SQL_TRANSACTION_UNSPECIFIED = 0; + SQL_DATA_DEFINITION_TRANSACTIONS = 1; + SQL_DATA_MANIPULATION_TRANSACTIONS = 2; +} + +enum SqlSupportedResultSetType { + SQL_RESULT_SET_TYPE_UNSPECIFIED = 0; + SQL_RESULT_SET_TYPE_FORWARD_ONLY = 1; + SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE = 2; + SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE = 3; +} + +enum SqlSupportedResultSetConcurrency { + SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED = 0; + SQL_RESULT_SET_CONCURRENCY_READ_ONLY = 1; + SQL_RESULT_SET_CONCURRENCY_UPDATABLE = 2; +} + +enum SqlSupportsConvert { + SQL_CONVERT_BIGINT = 0; + SQL_CONVERT_BINARY = 1; + SQL_CONVERT_BIT = 2; + SQL_CONVERT_CHAR = 3; + SQL_CONVERT_DATE = 4; + SQL_CONVERT_DECIMAL = 5; + SQL_CONVERT_FLOAT = 6; + SQL_CONVERT_INTEGER = 7; + SQL_CONVERT_INTERVAL_DAY_TIME = 8; + SQL_CONVERT_INTERVAL_YEAR_MONTH = 9; + SQL_CONVERT_LONGVARBINARY = 10; + SQL_CONVERT_LONGVARCHAR = 11; + SQL_CONVERT_NUMERIC = 12; + SQL_CONVERT_REAL = 13; + SQL_CONVERT_SMALLINT = 14; + SQL_CONVERT_TIME = 15; + SQL_CONVERT_TIMESTAMP = 16; + SQL_CONVERT_TINYINT = 17; + SQL_CONVERT_VARBINARY = 18; + SQL_CONVERT_VARCHAR = 19; +} + +/** + * The JDBC/ODBC-defined type of any object. + * All the values here are the same as in the JDBC and ODBC specs. + */ +enum XdbcDataType { + XDBC_UNKNOWN_TYPE = 0; + XDBC_CHAR = 1; + XDBC_NUMERIC = 2; + XDBC_DECIMAL = 3; + XDBC_INTEGER = 4; + XDBC_SMALLINT = 5; + XDBC_FLOAT = 6; + XDBC_REAL = 7; + XDBC_DOUBLE = 8; + XDBC_DATETIME = 9; + XDBC_INTERVAL = 10; + XDBC_VARCHAR = 12; + XDBC_DATE = 91; + XDBC_TIME = 92; + XDBC_TIMESTAMP = 93; + XDBC_LONGVARCHAR = -1; + XDBC_BINARY = -2; + XDBC_VARBINARY = -3; + XDBC_LONGVARBINARY = -4; + XDBC_BIGINT = -5; + XDBC_TINYINT = -6; + XDBC_BIT = -7; + XDBC_WCHAR = -8; + XDBC_WVARCHAR = -9; +} + +/** + * Detailed subtype information for XDBC_TYPE_DATETIME and XDBC_TYPE_INTERVAL. 
+ */
+enum XdbcDatetimeSubcode {
+ option allow_alias = true;
+ XDBC_SUBCODE_UNKNOWN = 0;
+ XDBC_SUBCODE_YEAR = 1;
+ XDBC_SUBCODE_DATE = 1;
+ XDBC_SUBCODE_TIME = 2;
+ XDBC_SUBCODE_MONTH = 2;
+ XDBC_SUBCODE_TIMESTAMP = 3;
+ XDBC_SUBCODE_DAY = 3;
+ XDBC_SUBCODE_TIME_WITH_TIMEZONE = 4;
+ XDBC_SUBCODE_HOUR = 4;
+ XDBC_SUBCODE_TIMESTAMP_WITH_TIMEZONE = 5;
+ XDBC_SUBCODE_MINUTE = 5;
+ XDBC_SUBCODE_SECOND = 6;
+ XDBC_SUBCODE_YEAR_TO_MONTH = 7;
+ XDBC_SUBCODE_DAY_TO_HOUR = 8;
+ XDBC_SUBCODE_DAY_TO_MINUTE = 9;
+ XDBC_SUBCODE_DAY_TO_SECOND = 10;
+ XDBC_SUBCODE_HOUR_TO_MINUTE = 11;
+ XDBC_SUBCODE_HOUR_TO_SECOND = 12;
+ XDBC_SUBCODE_MINUTE_TO_SECOND = 13;
+ XDBC_SUBCODE_INTERVAL_YEAR = 101;
+ XDBC_SUBCODE_INTERVAL_MONTH = 102;
+ XDBC_SUBCODE_INTERVAL_DAY = 103;
+ XDBC_SUBCODE_INTERVAL_HOUR = 104;
+ XDBC_SUBCODE_INTERVAL_MINUTE = 105;
+ XDBC_SUBCODE_INTERVAL_SECOND = 106;
+ XDBC_SUBCODE_INTERVAL_YEAR_TO_MONTH = 107;
+ XDBC_SUBCODE_INTERVAL_DAY_TO_HOUR = 108;
+ XDBC_SUBCODE_INTERVAL_DAY_TO_MINUTE = 109;
+ XDBC_SUBCODE_INTERVAL_DAY_TO_SECOND = 110;
+ XDBC_SUBCODE_INTERVAL_HOUR_TO_MINUTE = 111;
+ XDBC_SUBCODE_INTERVAL_HOUR_TO_SECOND = 112;
+ XDBC_SUBCODE_INTERVAL_MINUTE_TO_SECOND = 113;
+}
+
+enum Nullable {
+ /**
+  * Indicates that the field does not allow the use of null values.
+  */
+ NULLABILITY_NO_NULLS = 0;
+
+ /**
+  * Indicates that the field allows the use of null values.
+  */
+ NULLABILITY_NULLABLE = 1;
+
+ /**
+  * Indicates that the nullability of the field cannot be determined.
+  */
+ NULLABILITY_UNKNOWN = 2;
+}
+
+enum Searchable {
+ /**
+  * Indicates that the column cannot be used in a WHERE clause.
+  */
+ SEARCHABLE_NONE = 0;
+
+ /**
+  * Indicates that the column can be used in a WHERE clause with a
+  * LIKE operator.
+  */
+ SEARCHABLE_CHAR = 1;
+
+ /**
+  * Indicates that the column can be used in a WHERE clause with any
+  * operator other than LIKE.
+  *
+  * - Allowed operators: comparison, quantified comparison, BETWEEN,
+  *   DISTINCT, IN, MATCH, and UNIQUE.
+  */
+ SEARCHABLE_BASIC = 2;
+
+ /**
+  * Indicates that the column can be used in a WHERE clause using any operator.
+  */
+ SEARCHABLE_FULL = 3;
+}
+
+/*
+ * Represents a request to retrieve information about the data types supported on a Flight SQL enabled backend.
+ * Used in the command member of FlightDescriptor for the following RPC calls:
+ * - GetSchema: return the schema of the query.
+ * - GetFlightInfo: execute the catalog metadata request.
+ *
+ * The returned schema will be:
+ * <
+ * type_name: utf8 not null (The name of the data type, for example: VARCHAR, INTEGER, etc),
+ * data_type: int32 not null (The SQL data type),
+ * column_size: int32 (The maximum size supported by that column.
+ * In case of exact numeric types, this represents the maximum precision.
+ * In case of string types, this represents the character length.
+ * In case of datetime data types, this represents the length in characters of the string representation.
+ * NULL is returned for data types where column size is not applicable.),
+ * literal_prefix: utf8 (Character or characters used to prefix a literal, NULL is returned for
+ * data types where a literal prefix is not applicable.),
+ * literal_suffix: utf8 (Character or characters used to terminate a literal,
+ * NULL is returned for data types where a literal suffix is not applicable.),
+ * create_params: list<utf8 not null>
+ * (A list of keywords corresponding to which parameters can be used when creating
+ * a column for that specific type.
+ * NULL is returned if there are no parameters for the data type definition.),
+ * nullable: int32 not null (Shows if the data type accepts a NULL value. The possible values can be seen in the
+ * Nullable enum.),
+ * case_sensitive: bool not null (Shows if a character data type is case-sensitive in collations and comparisons),
+ * searchable: int32 not null (Shows how the data type is used in a WHERE clause. The possible values can be seen in the
+ * Searchable enum.),
+ * unsigned_attribute: bool (Shows if the data type is unsigned. NULL is returned if the attribute is
+ * not applicable to the data type or the data type is not numeric.),
+ * fixed_prec_scale: bool not null (Shows if the data type has predefined fixed precision and scale.),
+ * auto_increment: bool (Shows if the data type is auto incremental. NULL is returned if the attribute
+ * is not applicable to the data type or the data type is not numeric.),
+ * local_type_name: utf8 (Localized version of the data source-dependent name of the data type. NULL
+ * is returned if a localized name is not supported by the data source),
+ * minimum_scale: int32 (The minimum scale of the data type on the data source.
+ * If a data type has a fixed scale, the MINIMUM_SCALE and MAXIMUM_SCALE
+ * columns both contain this value. NULL is returned if scale is not applicable.),
+ * maximum_scale: int32 (The maximum scale of the data type on the data source.
+ * NULL is returned if scale is not applicable.),
+ * sql_data_type: int32 not null (The value of the SQL DATA TYPE which has the same values
+ * as the data_type value. Except for interval and datetime, which
+ * use generic values. More info about those types can be
+ * obtained through datetime_subcode. The possible values can be seen
+ * in the XdbcDataType enum.),
+ * datetime_subcode: int32 (Only used when the SQL DATA TYPE is interval or datetime. It contains
+ * its subtypes. For types different from interval and datetime, this value
+ * is NULL. The possible values can be seen in the XdbcDatetimeSubcode enum.),
+ * num_prec_radix: int32 (If the data type is an approximate numeric type, this column contains
+ * the value 2 to indicate that COLUMN_SIZE specifies a number of bits. For
+ * exact numeric types, this column contains the value 10 to indicate that
+ * column size specifies a number of decimal digits. Otherwise, this column is NULL.),
+ * interval_precision: int32 (If the data type is an interval data type, then this column contains the value
+ * of the interval leading precision. Otherwise, this column is NULL. This field
+ * is only relevant to ODBC).
+ * >
+ * The returned data should be ordered by data_type and then by type_name.
+ */
+message CommandGetXdbcTypeInfo {
+
+ /*
+ * Specifies the data type to search for the info.
+ */
+ optional int32 data_type = 1;
+}
+
+/*
+ * Represents a request to retrieve the list of catalogs on a Flight SQL enabled backend.
+ * The definition of a catalog depends on vendor/implementation. It is usually the database itself.
+ * Used in the command member of FlightDescriptor for the following RPC calls:
+ * - GetSchema: return the Arrow schema of the query.
+ * - GetFlightInfo: execute the catalog metadata request.
+ *
+ * The returned Arrow schema will be:
+ * <
+ * catalog_name: utf8 not null
+ * >
+ * The returned data should be ordered by catalog_name.
+ */
+message CommandGetCatalogs {
+}
+
+/*
+ * Represents a request to retrieve the list of database schemas on a Flight SQL enabled backend.
+ * The definition of a database schema depends on vendor/implementation. It is usually a collection of tables. + * Used in the command member of FlightDescriptor for the following RPC calls: + * - GetSchema: return the Arrow schema of the query. + * - GetFlightInfo: execute the catalog metadata request. + * + * The returned Arrow schema will be: + * < + * catalog_name: utf8, + * db_schema_name: utf8 not null + * > + * The returned data should be ordered by catalog_name, then db_schema_name. + */ +message CommandGetDbSchemas { + + /* + * Specifies the Catalog to search for the tables. + * An empty string retrieves those without a catalog. + * If omitted the catalog name should not be used to narrow the search. + */ + optional string catalog = 1; + + /* + * Specifies a filter pattern for schemas to search for. + * When no db_schema_filter_pattern is provided, the pattern will not be used to narrow the search. + * In the pattern string, two special characters can be used to denote matching rules: + * - "%" means to match any substring with 0 or more characters. + * - "_" means to match any one character. + */ + optional string db_schema_filter_pattern = 2; +} + +/* + * Represents a request to retrieve the list of tables, and optionally their schemas, on a Flight SQL enabled backend. + * Used in the command member of FlightDescriptor for the following RPC calls: + * - GetSchema: return the Arrow schema of the query. + * - GetFlightInfo: execute the catalog metadata request. + * + * The returned Arrow schema will be: + * < + * catalog_name: utf8, + * db_schema_name: utf8, + * table_name: utf8 not null, + * table_type: utf8 not null, + * [optional] table_schema: bytes not null (schema of the table as described in Schema.fbs::Schema, + * it is serialized as an IPC message.) + * > + * Fields on table_schema may contain the following metadata: + * - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name + * - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name + * - ARROW:FLIGHT:SQL:TABLE_NAME - Table name + * - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. + * - ARROW:FLIGHT:SQL:PRECISION - Column precision/size + * - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable + * - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case-sensitive, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. + * The returned data should be ordered by catalog_name, db_schema_name, table_name, then table_type, followed by table_schema if requested. + */ +message CommandGetTables { + + /* + * Specifies the Catalog to search for the tables. + * An empty string retrieves those without a catalog. + * If omitted the catalog name should not be used to narrow the search. + */ + optional string catalog = 1; + + /* + * Specifies a filter pattern for schemas to search for. + * When no db_schema_filter_pattern is provided, all schemas matching other filters are searched. + * In the pattern string, two special characters can be used to denote matching rules: + * - "%" means to match any substring with 0 or more characters. + * - "_" means to match any one character. 
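The "%" / "_" filter-pattern rules above translate mechanically into a regular expression. A hedged Java sketch of how a server might evaluate db_schema_filter_pattern (class and method names invented for illustration, not an Arrow API):

```java
import java.util.regex.Pattern;

public final class FilterPattern {
  // Compiles a catalog-metadata filter pattern ("%" = any substring,
  // "_" = any single character) into a regex; every other character
  // is matched literally.
  static Pattern compile(String filterPattern) {
    StringBuilder regex = new StringBuilder();
    for (int i = 0; i < filterPattern.length(); i++) {
      char c = filterPattern.charAt(i);
      switch (c) {
        case '%': regex.append(".*"); break;
        case '_': regex.append('.'); break;
        default:  regex.append(Pattern.quote(String.valueOf(c)));
      }
    }
    return Pattern.compile(regex.toString());
  }

  public static void main(String[] args) {
    System.out.println(compile("sales%").matcher("sales_2024").matches()); // true
    System.out.println(compile("sal_s").matcher("sales").matches());       // true
    System.out.println(compile("sales%").matcher("marketing").matches());  // false
  }
}
```

The same semantics apply to the table and schema filter patterns of CommandGetTables below.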
+ */ + optional string db_schema_filter_pattern = 2; + + /* + * Specifies a filter pattern for tables to search for. + * When no table_name_filter_pattern is provided, all tables matching other filters are searched. + * In the pattern string, two special characters can be used to denote matching rules: + * - "%" means to match any substring with 0 or more characters. + * - "_" means to match any one character. + */ + optional string table_name_filter_pattern = 3; + + /* + * Specifies a filter of table types which must match. + * The table types depend on vendor/implementation. It is usually used to separate tables from views or system tables. + * TABLE, VIEW, and SYSTEM TABLE are commonly supported. + */ + repeated string table_types = 4; + + // Specifies if the Arrow schema should be returned for found tables. + bool include_schema = 5; +} + +/* + * Represents a request to retrieve the list of table types on a Flight SQL enabled backend. + * The table types depend on vendor/implementation. It is usually used to separate tables from views or system tables. + * TABLE, VIEW, and SYSTEM TABLE are commonly supported. + * Used in the command member of FlightDescriptor for the following RPC calls: + * - GetSchema: return the Arrow schema of the query. + * - GetFlightInfo: execute the catalog metadata request. + * + * The returned Arrow schema will be: + * < + * table_type: utf8 not null + * > + * The returned data should be ordered by table_type. + */ +message CommandGetTableTypes { +} + +/* + * Represents a request to retrieve the primary keys of a table on a Flight SQL enabled backend. + * Used in the command member of FlightDescriptor for the following RPC calls: + * - GetSchema: return the Arrow schema of the query. + * - GetFlightInfo: execute the catalog metadata request. + * + * The returned Arrow schema will be: + * < + * catalog_name: utf8, + * db_schema_name: utf8, + * table_name: utf8 not null, + * column_name: utf8 not null, + * key_name: utf8, + * key_sequence: int32 not null + * > + * The returned data should be ordered by catalog_name, db_schema_name, table_name, key_name, then key_sequence. + */ +message CommandGetPrimaryKeys { + + /* + * Specifies the catalog to search for the table. + * An empty string retrieves those without a catalog. + * If omitted the catalog name should not be used to narrow the search. + */ + optional string catalog = 1; + + /* + * Specifies the schema to search for the table. + * An empty string retrieves those without a schema. + * If omitted the schema name should not be used to narrow the search. + */ + optional string db_schema = 2; + + // Specifies the table to get the primary keys for. + string table = 3; +} + +enum UpdateDeleteRules { + CASCADE = 0; + RESTRICT = 1; + SET_NULL = 2; + NO_ACTION = 3; + SET_DEFAULT = 4; +} + +/* + * Represents a request to retrieve a description of the foreign key columns that reference the given table's + * primary key columns (the foreign keys exported by a table) of a table on a Flight SQL enabled backend. + * Used in the command member of FlightDescriptor for the following RPC calls: + * - GetSchema: return the Arrow schema of the query. + * - GetFlightInfo: execute the catalog metadata request. 
+ *
+ * The returned Arrow schema will be:
+ * <
+ * pk_catalog_name: utf8,
+ * pk_db_schema_name: utf8,
+ * pk_table_name: utf8 not null,
+ * pk_column_name: utf8 not null,
+ * fk_catalog_name: utf8,
+ * fk_db_schema_name: utf8,
+ * fk_table_name: utf8 not null,
+ * fk_column_name: utf8 not null,
+ * key_sequence: int32 not null,
+ * fk_key_name: utf8,
+ * pk_key_name: utf8,
+ * update_rule: uint8 not null,
+ * delete_rule: uint8 not null
+ * >
+ * The returned data should be ordered by fk_catalog_name, fk_db_schema_name, fk_table_name, fk_key_name, then key_sequence.
+ * update_rule and delete_rule return a byte that is equivalent to the actions declared in the UpdateDeleteRules enum.
+ */
+message CommandGetExportedKeys {
+
+ /*
+ * Specifies the catalog to search for the foreign key table.
+ * An empty string retrieves those without a catalog.
+ * If omitted the catalog name should not be used to narrow the search.
+ */
+ optional string catalog = 1;
+
+ /*
+ * Specifies the schema to search for the foreign key table.
+ * An empty string retrieves those without a schema.
+ * If omitted the schema name should not be used to narrow the search.
+ */
+ optional string db_schema = 2;
+
+ // Specifies the foreign key table to get the foreign keys for.
+ string table = 3;
+}
+
+/*
+ * Represents a request to retrieve the foreign keys of a table on a Flight SQL enabled backend.
+ * Used in the command member of FlightDescriptor for the following RPC calls:
+ * - GetSchema: return the Arrow schema of the query.
+ * - GetFlightInfo: execute the catalog metadata request.
+ *
+ * The returned Arrow schema will be:
+ * <
+ * pk_catalog_name: utf8,
+ * pk_db_schema_name: utf8,
+ * pk_table_name: utf8 not null,
+ * pk_column_name: utf8 not null,
+ * fk_catalog_name: utf8,
+ * fk_db_schema_name: utf8,
+ * fk_table_name: utf8 not null,
+ * fk_column_name: utf8 not null,
+ * key_sequence: int32 not null,
+ * fk_key_name: utf8,
+ * pk_key_name: utf8,
+ * update_rule: uint8 not null,
+ * delete_rule: uint8 not null
+ * >
+ * The returned data should be ordered by pk_catalog_name, pk_db_schema_name, pk_table_name, pk_key_name, then key_sequence.
+ * update_rule and delete_rule return a byte that is equivalent to actions:
+ * - 0 = CASCADE
+ * - 1 = RESTRICT
+ * - 2 = SET NULL
+ * - 3 = NO ACTION
+ * - 4 = SET DEFAULT
+ */
+message CommandGetImportedKeys {
+
+ /*
+ * Specifies the catalog to search for the primary key table.
+ * An empty string retrieves those without a catalog.
+ * If omitted the catalog name should not be used to narrow the search.
+ */
+ optional string catalog = 1;
+
+ /*
+ * Specifies the schema to search for the primary key table.
+ * An empty string retrieves those without a schema.
+ * If omitted the schema name should not be used to narrow the search.
+ */
+ optional string db_schema = 2;
+
+ // Specifies the primary key table to get the foreign keys for.
+ string table = 3;
+}
+
+/*
+ * Represents a request to retrieve a description of the foreign key columns in the given foreign key table that
+ * reference the primary key or the columns representing a unique constraint of the parent table (could be the same
+ * or a different table) on a Flight SQL enabled backend.
+ * Used in the command member of FlightDescriptor for the following RPC calls:
+ * - GetSchema: return the Arrow schema of the query.
+ * - GetFlightInfo: execute the catalog metadata request.
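The 0-4 byte values listed above for update_rule and delete_rule map by ordinal onto the UpdateDeleteRules enum. A small Java sketch of that mapping, again using a local mirror of the enum rather than the generated class:

```java
public final class ForeignKeyRules {
  // Local mirror of the UpdateDeleteRules enum; ordinals match the
  // byte values documented above (0 = CASCADE ... 4 = SET_DEFAULT).
  enum UpdateDeleteRules { CASCADE, RESTRICT, SET_NULL, NO_ACTION, SET_DEFAULT }

  static UpdateDeleteRules fromByte(int value) {
    UpdateDeleteRules[] rules = UpdateDeleteRules.values();
    if (value < 0 || value >= rules.length) {
      throw new IllegalArgumentException("Unknown update/delete rule: " + value);
    }
    return rules[value];
  }
}
```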
+ *
+ * The returned Arrow schema will be:
+ * <
+ * pk_catalog_name: utf8,
+ * pk_db_schema_name: utf8,
+ * pk_table_name: utf8 not null,
+ * pk_column_name: utf8 not null,
+ * fk_catalog_name: utf8,
+ * fk_db_schema_name: utf8,
+ * fk_table_name: utf8 not null,
+ * fk_column_name: utf8 not null,
+ * key_sequence: int32 not null,
+ * fk_key_name: utf8,
+ * pk_key_name: utf8,
+ * update_rule: uint8 not null,
+ * delete_rule: uint8 not null
+ * >
+ * The returned data should be ordered by pk_catalog_name, pk_db_schema_name, pk_table_name, pk_key_name, then key_sequence.
+ * update_rule and delete_rule return a byte that is equivalent to actions:
+ * - 0 = CASCADE
+ * - 1 = RESTRICT
+ * - 2 = SET NULL
+ * - 3 = NO ACTION
+ * - 4 = SET DEFAULT
+ */
+message CommandGetCrossReference {
+
+ /**
+  * The catalog name where the parent table is.
+  * An empty string retrieves those without a catalog.
+  * If omitted the catalog name should not be used to narrow the search.
+  */
+ optional string pk_catalog = 1;
+
+ /**
+  * The schema name where the parent table is.
+  * An empty string retrieves those without a schema.
+  * If omitted the schema name should not be used to narrow the search.
+  */
+ optional string pk_db_schema = 2;
+
+ /**
+  * The parent table name. It cannot be null.
+  */
+ string pk_table = 3;
+
+ /**
+  * The catalog name where the foreign table is.
+  * An empty string retrieves those without a catalog.
+  * If omitted the catalog name should not be used to narrow the search.
+  */
+ optional string fk_catalog = 4;
+
+ /**
+  * The schema name where the foreign table is.
+  * An empty string retrieves those without a schema.
+  * If omitted the schema name should not be used to narrow the search.
+  */
+ optional string fk_db_schema = 5;
+
+ /**
+  * The foreign table name. It cannot be null.
+  */
+ string fk_table = 6;
+}
+
+// Query Execution Action Messages
+
+/*
+ * Request message for the "CreatePreparedStatement" action on a Flight SQL enabled backend.
+ */
+message ActionCreatePreparedStatementRequest {
+
+ // The valid SQL string to create a prepared statement for.
+ string query = 1;
+ // Create/execute the prepared statement as part of this transaction (if
+ // unset, executions of the prepared statement will be auto-committed).
+ optional bytes transaction_id = 2;
+}
+
+/*
+ * An embedded message describing a Substrait plan to execute.
+ */
+message SubstraitPlan {
+
+ // The serialized substrait.Plan to create a prepared statement for.
+ // XXX(ARROW-16902): this is bytes instead of an embedded message
+ // because Protobuf does not really support one DLL using Protobuf
+ // definitions from another DLL.
+ bytes plan = 1;
+ // The Substrait release, e.g. "0.12.0". This information is not
+ // tracked in the plan itself, so this is the only way for consumers
+ // to potentially know if they can handle the plan.
+ string version = 2;
+}
+
+/*
+ * Request message for the "CreatePreparedSubstraitPlan" action on a Flight SQL enabled backend.
+ */
+message ActionCreatePreparedSubstraitPlanRequest {
+
+ // The serialized substrait.Plan to create a prepared statement for.
+ SubstraitPlan plan = 1;
+ // Create/execute the prepared statement as part of this transaction (if
+ // unset, executions of the prepared statement will be auto-committed).
+ optional bytes transaction_id = 2;
+}
+
+/*
+ * Wrap the result of a "CreatePreparedStatement" or "CreatePreparedSubstraitPlan" action.
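As a sketch of how a client assembles one of these action requests: Flight SQL wraps action bodies in google.protobuf.Any. The snippet below assumes the classes protoc generates from this file live under org.apache.arrow.flight.sql.impl (the java_package used by the Arrow Java build); treat the package and class names as assumptions to adjust for your build:

```java
import com.google.protobuf.Any;
import org.apache.arrow.flight.sql.impl.FlightSql.ActionCreatePreparedStatementRequest;

public final class PreparedStatementActions {
  // Builds the body of a "CreatePreparedStatement" action. The import above
  // is an assumption about where the generated protobuf classes land.
  static byte[] createRequestBody(String query) {
    ActionCreatePreparedStatementRequest request =
        ActionCreatePreparedStatementRequest.newBuilder()
            .setQuery(query)
            .build();
    // Flight SQL action bodies and results are wrapped in google.protobuf.Any.
    return Any.pack(request).toByteArray();
  }
}
```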
+ * + * The resultant PreparedStatement can be closed either: + * - Manually, through the "ClosePreparedStatement" action; + * - Automatically, by a server timeout. + * + * The result should be wrapped in a google.protobuf.Any message. + */ +message ActionCreatePreparedStatementResult { + + // Opaque handle for the prepared statement on the server. + bytes prepared_statement_handle = 1; + + // If a result set generating query was provided, dataset_schema contains the + // schema of the result set. It should be an IPC-encapsulated Schema, as described in Schema.fbs. + // For some queries, the schema of the results may depend on the schema of the parameters. The server + // should provide its best guess as to the schema at this point. Clients must not assume that this + // schema, if provided, will be accurate. + bytes dataset_schema = 2; + + // If the query provided contained parameters, parameter_schema contains the + // schema of the expected parameters. It should be an IPC-encapsulated Schema, as described in Schema.fbs. + bytes parameter_schema = 3; +} + +/* + * Request message for the "ClosePreparedStatement" action on a Flight SQL enabled backend. + * Closes server resources associated with the prepared statement handle. + */ +message ActionClosePreparedStatementRequest { + + // Opaque handle for the prepared statement on the server. + bytes prepared_statement_handle = 1; +} + +/* + * Request message for the "BeginTransaction" action. + * Begins a transaction. + */ +message ActionBeginTransactionRequest { +} + +/* + * Request message for the "BeginSavepoint" action. + * Creates a savepoint within a transaction. + * + * Only supported if FLIGHT_SQL_TRANSACTION is + * FLIGHT_SQL_TRANSACTION_SUPPORT_SAVEPOINT. + */ +message ActionBeginSavepointRequest { + + // The transaction to which a savepoint belongs. + bytes transaction_id = 1; + // Name for the savepoint. + string name = 2; +} + +/* + * The result of a "BeginTransaction" action. + * + * The transaction can be manipulated with the "EndTransaction" action, or + * automatically via server timeout. If the transaction times out, then it is + * automatically rolled back. + * + * The result should be wrapped in a google.protobuf.Any message. + */ +message ActionBeginTransactionResult { + + // Opaque handle for the transaction on the server. + bytes transaction_id = 1; +} + +/* + * The result of a "BeginSavepoint" action. + * + * The transaction can be manipulated with the "EndSavepoint" action. + * If the associated transaction is committed, rolled back, or times + * out, then the savepoint is also invalidated. + * + * The result should be wrapped in a google.protobuf.Any message. + */ +message ActionBeginSavepointResult { + + // Opaque handle for the savepoint on the server. + bytes savepoint_id = 1; +} + +/* + * Request message for the "EndTransaction" action. + * + * Commit (COMMIT) or rollback (ROLLBACK) the transaction. + * + * If the action completes successfully, the transaction handle is + * invalidated, as are all associated savepoints. + */ +message ActionEndTransactionRequest { + + enum EndTransaction { + END_TRANSACTION_UNSPECIFIED = 0; + // Commit the transaction. + END_TRANSACTION_COMMIT = 1; + // Roll back the transaction. + END_TRANSACTION_ROLLBACK = 2; + } + // Opaque handle for the transaction on the server. + bytes transaction_id = 1; + // Whether to commit/rollback the given transaction. + EndTransaction action = 2; +} + +/* + * Request message for the "EndSavepoint" action. 
+ * + * Release (RELEASE) the savepoint or rollback (ROLLBACK) to the + * savepoint. + * + * Releasing a savepoint invalidates that savepoint. Rolling back to + * a savepoint does not invalidate the savepoint, but invalidates all + * savepoints created after the current savepoint. + */ +message ActionEndSavepointRequest { + + enum EndSavepoint { + END_SAVEPOINT_UNSPECIFIED = 0; + // Release the savepoint. + END_SAVEPOINT_RELEASE = 1; + // Roll back to a savepoint. + END_SAVEPOINT_ROLLBACK = 2; + } + // Opaque handle for the savepoint on the server. + bytes savepoint_id = 1; + // Whether to rollback/release the given savepoint. + EndSavepoint action = 2; +} + +// Query Execution Messages. + +/* + * Represents a SQL query. Used in the command member of FlightDescriptor + * for the following RPC calls: + * - GetSchema: return the Arrow schema of the query. + * Fields on this schema may contain the following metadata: + * - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name + * - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name + * - ARROW:FLIGHT:SQL:TABLE_NAME - Table name + * - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. + * - ARROW:FLIGHT:SQL:PRECISION - Column precision/size + * - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable + * - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case-sensitive, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. + * - GetFlightInfo: execute the query. + */ +message CommandStatementQuery { + + // The SQL syntax. + string query = 1; + // Include the query as part of this transaction (if unset, the query is auto-committed). + optional bytes transaction_id = 2; +} + +/* + * Represents a Substrait plan. Used in the command member of FlightDescriptor + * for the following RPC calls: + * - GetSchema: return the Arrow schema of the query. + * Fields on this schema may contain the following metadata: + * - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name + * - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name + * - ARROW:FLIGHT:SQL:TABLE_NAME - Table name + * - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. + * - ARROW:FLIGHT:SQL:PRECISION - Column precision/size + * - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable + * - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case-sensitive, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. + * - GetFlightInfo: execute the query. + * - DoPut: execute the query. + */ +message CommandStatementSubstraitPlan { + + // A serialized substrait.Plan + SubstraitPlan plan = 1; + // Include the query as part of this transaction (if unset, the query is auto-committed). + optional bytes transaction_id = 2; +} + +/** + * Represents a ticket resulting from GetFlightInfo with a CommandStatementQuery. 
+ * This should be used only once and treated as an opaque value, that is, clients should not attempt to parse this.
+ */
+message TicketStatementQuery {
+
+ // Unique identifier for the instance of the statement to execute.
+ bytes statement_handle = 1;
+}
+
+/*
+ * Represents an instance of executing a prepared statement. Used in the command member of FlightDescriptor for
+ * the following RPC calls:
+ * - GetSchema: return the Arrow schema of the query.
+ * Fields on this schema may contain the following metadata:
+ * - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name
+ * - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name
+ * - ARROW:FLIGHT:SQL:TABLE_NAME - Table name
+ * - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column.
+ * - ARROW:FLIGHT:SQL:PRECISION - Column precision/size
+ * - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable
+ * - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise.
+ * - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case-sensitive, "0" otherwise.
+ * - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise.
+ * - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise.
+ *
+ * If the schema is retrieved after parameter values have been bound with DoPut, then the server should account
+ * for the parameters when determining the schema.
+ * - DoPut: bind parameter values. All of the bound parameter sets will be executed as a single atomic execution.
+ * - GetFlightInfo: execute the prepared statement instance.
+ */
+message CommandPreparedStatementQuery {
+
+ // Opaque handle for the prepared statement on the server.
+ bytes prepared_statement_handle = 1;
+}
+
+/*
+ * Represents a SQL update query. Used in the command member of FlightDescriptor
+ * for the RPC call DoPut to cause the server to execute the included SQL update.
+ */
+message CommandStatementUpdate {
+
+ // The SQL syntax.
+ string query = 1;
+ // Include the query as part of this transaction (if unset, the query is auto-committed).
+ optional bytes transaction_id = 2;
+}
+
+/*
+ * Represents a SQL update query. Used in the command member of FlightDescriptor
+ * for the RPC call DoPut to cause the server to execute the included
+ * prepared statement handle as an update.
+ */
+message CommandPreparedStatementUpdate {
+
+ // Opaque handle for the prepared statement on the server.
+ bytes prepared_statement_handle = 1;
+}
+
+/*
+ * Represents a bulk ingestion request. Used in the command member of FlightDescriptor
+ * for the RPC call DoPut to cause the server to load the contents of the stream's
+ * FlightData into the target destination.
+ */
+message CommandStatementIngest {
+
+ // Options for table definition behavior
+ message TableDefinitionOptions {
+ // The action to take if the target table does not exist
+ enum TableNotExistOption {
+ // Do not use. Servers should error if this is specified by a client.
+ TABLE_NOT_EXIST_OPTION_UNSPECIFIED = 0;
+ // Create the table if it does not exist
+ TABLE_NOT_EXIST_OPTION_CREATE = 1;
+ // Fail if the table does not exist
+ TABLE_NOT_EXIST_OPTION_FAIL = 2;
+ }
+ // The action to take if the target table already exists
+ enum TableExistsOption {
+ // Do not use. Servers should error if this is specified by a client.
+ TABLE_EXISTS_OPTION_UNSPECIFIED = 0;
+ // Fail if the table already exists
+ TABLE_EXISTS_OPTION_FAIL = 1;
+ // Append to the table if it already exists
+ TABLE_EXISTS_OPTION_APPEND = 2;
+ // Drop and recreate the table if it already exists
+ TABLE_EXISTS_OPTION_REPLACE = 3;
+ }
+
+ TableNotExistOption if_not_exist = 1;
+ TableExistsOption if_exists = 2;
+ }
+
+ // The behavior for handling the table definition.
+ TableDefinitionOptions table_definition_options = 1;
+ // The table to load data into.
+ string table = 2;
+ // The db_schema of the destination table to load data into. If unset, a backend-specific default may be used.
+ optional string schema = 3;
+ // The catalog of the destination table to load data into. If unset, a backend-specific default may be used.
+ optional string catalog = 4;
+ /*
+ * Store ingested data in a temporary table.
+ * The effect of setting temporary is to place the table in a backend-defined namespace, and to drop the table at the end of the session.
+ * The namespacing may make use of a backend-specific schema and/or catalog.
+ * The server should return an error if an explicit choice of schema or catalog is incompatible with the server's namespacing decision.
+ */
+ bool temporary = 5;
+ // Perform the ingestion as part of this transaction. If specified, results should not be committed in the event of an error/cancellation.
+ optional bytes transaction_id = 6;
+
+ // Future extensions to the parameters of CommandStatementIngest should be added here, at a lower index than the generic 'options' parameter.
+
+ // Backend-specific options.
+ map<string, string> options = 1000;
+}
+
+/*
+ * Returned from the RPC call DoPut when a CommandStatementUpdate,
+ * CommandPreparedStatementUpdate, or CommandStatementIngest was
+ * in the request, containing results from the update.
+ */
+message DoPutUpdateResult {
+
+ // The number of records updated. A return value of -1 represents
+ // an unknown updated record count.
+ int64 record_count = 1;
+}
+
+/* An *optional* response returned when `DoPut` is called with `CommandPreparedStatementQuery`.
+ *
+ * *Note on legacy behavior*: previous versions of the protocol did not return any result for
+ * this command, and that behavior should still be supported by clients. In that case, the client
+ * can continue as though the fields in this message were not provided or set to sensible default values.
+ */
+message DoPutPreparedStatementResult {
+
+ // Represents a (potentially updated) opaque handle for the prepared statement on the server.
+ // Because the handle could potentially be updated, any previous handles for this prepared
+ // statement should be considered invalid, and all subsequent requests for this prepared
+ // statement must use this new handle.
+ // The updated handle allows implementing query parameters with stateless services.
+ //
+ // When an updated handle is not provided by the server, clients should continue
+ // using the previous handle provided by `ActionCreatePreparedStatementResult`.
+ optional bytes prepared_statement_handle = 1;
+}
+
+/*
+ * Request message for the "CancelQuery" action.
+ *
+ * Explicitly cancel a running query.
+ *
+ * This lets a single client explicitly cancel work, no matter how many clients
+ * are involved/whether the query is distributed or not, given server support.
+ * The transaction/statement is not rolled back; it is the application's job to
+ * commit or rollback as appropriate.
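Since DoPutUpdateResult.record_count uses -1 as an in-band marker for "unknown", clients may want to normalize it before use; a tiny Java sketch under that assumption:

```java
import java.util.OptionalLong;

public final class UpdateResults {
  // -1 is the protocol's in-band marker for "unknown record count".
  static OptionalLong affectedRows(long recordCount) {
    return recordCount == -1 ? OptionalLong.empty() : OptionalLong.of(recordCount);
  }
}
```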
This only indicates the client no longer + * wishes to read the remainder of the query results or continue submitting + * data. + * + * This command is idempotent. + * + * This command is deprecated since 13.0.0. Use the "CancelFlightInfo" + * action with DoAction instead. + */ +message ActionCancelQueryRequest { + option deprecated = true; + + // The result of the GetFlightInfo RPC that initiated the query. + // XXX(ARROW-16902): this must be a serialized FlightInfo, but is + // rendered as bytes because Protobuf does not really support one + // DLL using Protobuf definitions from another DLL. + bytes info = 1; +} + +/* + * The result of cancelling a query. + * + * The result should be wrapped in a google.protobuf.Any message. + * + * This command is deprecated since 13.0.0. Use the "CancelFlightInfo" + * action with DoAction instead. + */ +message ActionCancelQueryResult { + option deprecated = true; + + enum CancelResult { + // The cancellation status is unknown. Servers should avoid using + // this value (send a NOT_FOUND error if the requested query is + // not known). Clients can retry the request. + CANCEL_RESULT_UNSPECIFIED = 0; + // The cancellation request is complete. Subsequent requests with + // the same payload may return CANCELLED or a NOT_FOUND error. + CANCEL_RESULT_CANCELLED = 1; + // The cancellation request is in progress. The client may retry + // the cancellation request. + CANCEL_RESULT_CANCELLING = 2; + // The query is not cancellable. The client should not retry the + // cancellation request. + CANCEL_RESULT_NOT_CANCELLABLE = 3; + } + + CancelResult result = 1; +} + +extend google.protobuf.MessageOptions { + bool experimental = 1000; +} diff --git a/arrow-format/Message.fbs b/arrow-format/Message.fbs new file mode 100644 index 000000000..be57533d8 --- /dev/null +++ b/arrow-format/Message.fbs @@ -0,0 +1,157 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +include "Schema.fbs"; +include "SparseTensor.fbs"; +include "Tensor.fbs"; + +namespace org.apache.arrow.flatbuf; + +/// ---------------------------------------------------------------------- +/// Data structures for describing a table row batch (a collection of +/// equal-length Arrow arrays) + +/// Metadata about a field at some level of a nested type tree (but not +/// its children). +/// +/// For example, a List with values `[[1, 2, 3], null, [4], [5, 6], null]` +/// would have {length: 5, null_count: 2} for its List node, and {length: 6, +/// null_count: 0} for its Int16 node, as separate FieldNode structs +struct FieldNode { + /// The number of value slots in the Arrow array at this level of a nested + /// tree + length: long; + + /// The number of observed nulls. 
Fields with null_count == 0 may choose not
+ /// to write their physical validity bitmap out as a materialized buffer,
+ /// instead setting the length of the bitmap buffer to 0.
+ null_count: long;
+}
+
+enum CompressionType:byte {
+ // LZ4 frame format, for portability, as provided by lz4frame.h or wrappers
+ // thereof. Not to be confused with "raw" (also called "block") format
+ // provided by lz4.h
+ LZ4_FRAME,
+
+ // Zstandard
+ ZSTD
+}
+
+/// Provided for forward compatibility in case we need to support different
+/// strategies for compressing the IPC message body (like whole-body
+/// compression rather than buffer-level) in the future
+enum BodyCompressionMethod:byte {
+ /// Each constituent buffer is first compressed with the indicated
+ /// compressor, and then written with the uncompressed length in the first 8
+ /// bytes as a 64-bit little-endian signed integer followed by the compressed
+ /// buffer bytes (and then padding as required by the protocol). The
+ /// uncompressed length may be set to -1 to indicate that the data that
+ /// follows is not compressed, which can be useful for cases where
+ /// compression does not yield appreciable savings.
+ BUFFER
+}
+
+/// Optional compression for the memory buffers constituting IPC message
+/// bodies. Intended for use with RecordBatch but could be used for other
+/// message types
+table BodyCompression {
+ /// Compressor library.
+ /// For LZ4_FRAME, each compressed buffer must consist of a single frame.
+ codec: CompressionType = LZ4_FRAME;
+
+ /// Indicates the way the record batch body was compressed
+ method: BodyCompressionMethod = BUFFER;
+}
+
+/// A data header describing the shared memory layout of a "record" or "row"
+/// batch. Some systems call this a "row batch" internally and others a "record
+/// batch".
+table RecordBatch {
+ /// number of records / rows. The arrays in the batch should all have this
+ /// length
+ length: long;
+
+ /// Nodes correspond to the pre-ordered flattened logical schema
+ nodes: [FieldNode];
+
+ /// Buffers correspond to the pre-ordered flattened buffer tree
+ ///
+ /// The number of buffers appended to this list depends on the schema. For
+ /// example, most primitive arrays will have 2 buffers, 1 for the validity
+ /// bitmap and 1 for the values. For struct arrays, there will only be a
+ /// single buffer for the validity (nulls) bitmap
+ buffers: [Buffer];
+
+ /// Optional compression of the message body
+ compression: BodyCompression;
+
+ /// Some types such as Utf8View are represented using a variable number of buffers.
+ /// For each such Field in the pre-ordered flattened logical schema, there will be
+ /// an entry in variadicBufferCounts to indicate the number of variadic
+ /// buffers which belong to that Field in the current RecordBatch.
+ ///
+ /// For example, the schema
+ ///     col1: Struct<alpha: Int32, beta: BinaryView>
+ ///     col2: Utf8View
+ /// contains two Fields with variadic buffers so variadicBufferCounts will have
+ /// two entries, the first counting the variadic buffers of `col1.beta` and the
+ /// second counting `col2`'s.
+ ///
+ /// This field may be omitted if and only if the schema contains no Fields with
+ /// a variable number of buffers, such as BinaryView and Utf8View.
+ variadicBufferCounts: [long];
+}
+
+/// For sending dictionary encoding information. Any Field can be
+/// dictionary-encoded, but in this case none of its children may be
+/// dictionary-encoded.
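The BUFFER framing described above (an 8-byte little-endian uncompressed length, with -1 meaning "stored uncompressed") can be split off before handing the payload to a codec. A hedged Java sketch; the decompress hook is a stub, not a real Arrow API:

```java
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

public final class BufferCompressionFraming {
  // Splits a compressed IPC body buffer into its 8-byte little-endian
  // uncompressed-length prefix and the payload that follows it.
  static ByteBuffer readBody(ByteBuffer buffer) {
    ByteBuffer le = buffer.duplicate().order(ByteOrder.LITTLE_ENDIAN);
    long uncompressedLength = le.getLong(); // first 8 bytes of the buffer
    ByteBuffer payload = le.slice();        // everything after the prefix
    if (uncompressedLength == -1) {
      return payload; // -1 means the payload was stored uncompressed
    }
    return decompress(payload, uncompressedLength);
  }

  // Stub: wire in an LZ4-frame or Zstandard codec here.
  private static ByteBuffer decompress(ByteBuffer payload, long uncompressedLength) {
    throw new UnsupportedOperationException("plug in an LZ4_FRAME or ZSTD codec");
  }
}
```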
+/// There is one vector / column per dictionary, but that vector / column +/// may be spread across multiple dictionary batches by using the isDelta +/// flag + +table DictionaryBatch { + id: long; + data: RecordBatch; + + /// If isDelta is true the values in the dictionary are to be appended to a + /// dictionary with the indicated id. If isDelta is false this dictionary + /// should replace the existing dictionary. + isDelta: bool = false; +} + +/// ---------------------------------------------------------------------- +/// The root Message type + +/// This union enables us to easily send different message types without +/// redundant storage, and in the future we can easily add new message types. +/// +/// Arrow implementations do not need to implement all of the message types, +/// which may include experimental metadata types. For maximum compatibility, +/// it is best to send data using RecordBatch +union MessageHeader { + Schema, DictionaryBatch, RecordBatch, Tensor, SparseTensor +} + +table Message { + version: org.apache.arrow.flatbuf.MetadataVersion; + header: MessageHeader; + bodyLength: long; + custom_metadata: [ KeyValue ]; +} + +root_type Message; diff --git a/arrow-format/README.rst b/arrow-format/README.rst new file mode 100644 index 000000000..0eaad49b7 --- /dev/null +++ b/arrow-format/README.rst @@ -0,0 +1,25 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Arrow Protocol Files +==================== + +This folder contains binary protocol definitions for the Arrow columnar format +and other parts of the project, like the Flight RPC framework. + +For documentation about the Arrow format, see the `docs/source/format` +directory. diff --git a/arrow-format/Schema.fbs b/arrow-format/Schema.fbs new file mode 100644 index 000000000..e8e14b112 --- /dev/null +++ b/arrow-format/Schema.fbs @@ -0,0 +1,571 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/// Logical types, vector layouts, and schemas + +/// Format Version History. 
+/// Version 1.0 - Forward and backwards compatibility guaranteed.
+/// Version 1.1 - Add Decimal256.
+/// Version 1.2 - Add Interval MONTH_DAY_NANO.
+/// Version 1.3 - Add Run-End Encoded.
+/// Version 1.4 - Add BinaryView, Utf8View, variadicBufferCounts, ListView, and
+///               LargeListView.
+/// Version 1.5 - Add 32-bit and 64-bit as allowed bit widths for Decimal
+
+namespace org.apache.arrow.flatbuf;
+
+enum MetadataVersion:short {
+ /// 0.1.0 (October 2016).
+ V1,
+
+ /// 0.2.0 (February 2017). Non-backwards compatible with V1.
+ V2,
+
+ /// 0.3.0 -> 0.7.1 (May - December 2017). Non-backwards compatible with V2.
+ V3,
+
+ /// >= 0.8.0 (December 2017). Non-backwards compatible with V3.
+ V4,
+
+ /// >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4
+ /// metadata and IPC messages). Implementations are recommended to provide a
+ /// V4 compatibility mode with V5 format changes disabled.
+ ///
+ /// Incompatible changes between V4 and V5:
+ /// - Union buffer layout has changed. In V5, Unions don't have a validity
+ ///   bitmap buffer.
+ V5,
+}
+
+/// Represents Arrow Features that might not have full support
+/// within implementations. This is intended to be used in
+/// two scenarios:
+/// 1. A mechanism for readers of Arrow Streams
+///    and files to understand that the stream or file makes
+///    use of a feature that isn't supported by or is unknown to
+///    the implementation (and therefore can meet the Arrow
+///    forward compatibility guarantees).
+/// 2. A means of negotiating between a client and server
+///    what features a stream is allowed to use. The enum
+///    values here are intended to represent higher-level
+///    features; additional details may be negotiated
+///    with key-value pairs specific to the protocol.
+///
+/// Enums added to this list should be assigned power-of-two values
+/// to facilitate exchanging and comparing bitmaps for supported
+/// features.
+enum Feature : long {
+ /// Needed to make flatbuffers happy.
+ UNUSED = 0,
+ /// The stream makes use of multiple full dictionaries with the
+ /// same ID and assumes clients implement dictionary replacement
+ /// correctly.
+ DICTIONARY_REPLACEMENT = 1,
+ /// The stream makes use of compressed bodies as described
+ /// in Message.fbs.
+ COMPRESSED_BODY = 2
+}
+
+/// These are stored in the flatbuffer in the Type union below
+
+table Null {
+}
+
+/// A Struct_ in the flatbuffer metadata is the same as an Arrow Struct
+/// (according to the physical memory layout). We used Struct_ here as
+/// Struct is a reserved word in Flatbuffers
+table Struct_ {
+}
+
+table List {
+}
+
+/// Same as List, but with 64-bit offsets, allowing representation of
+/// extremely large data values.
+table LargeList {
+}
+
+/// Represents the same logical types that List can, but contains offsets and
+/// sizes allowing for writes in any order and sharing of child values among
+/// list values.
+table ListView {
+}
+
+/// Same as ListView, but with 64-bit offsets and sizes, allowing representation of
+/// extremely large data values.
+table LargeListView {
+}
+
+table FixedSizeList {
+ /// Number of list items per value
+ listSize: int;
+}
+
+/// A Map is a logical nested type that is represented as
+///
+/// List<entries: Struct<key: K, value: V>>
+///
+/// In this layout, the keys and values are each respectively contiguous. We do
+/// not constrain the key and value types, so the application is responsible
+/// for ensuring that the keys are hashable and unique. Whether the keys are sorted
+/// may be set in the metadata for this field.
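Because Feature values are assigned powers of two, a negotiated feature set travels as a single integer bitmap. A minimal Java sketch of combining and testing such a bitmap (constants mirror the Feature enum above; this is not a generated class):

```java
public final class FeatureBitmap {
  // Constants mirror the Feature enum; powers of two by design.
  static final long DICTIONARY_REPLACEMENT = 1L;
  static final long COMPRESSED_BODY = 2L;

  static long combine(long... features) {
    long bitmap = 0L;
    for (long feature : features) {
      bitmap |= feature;
    }
    return bitmap;
  }

  static boolean supports(long bitmap, long feature) {
    return (bitmap & feature) == feature;
  }

  public static void main(String[] args) {
    long negotiated = combine(DICTIONARY_REPLACEMENT, COMPRESSED_BODY); // 3
    System.out.println(supports(negotiated, COMPRESSED_BODY));          // true
  }
}
```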
+
+enum UnionMode:short { Sparse, Dense }
+
+/// A union is a complex type with children in Field.
+/// By default, ids in the type vector refer to the offsets in the children;
+/// optionally, typeIds provides an indirection between the child offset and
+/// the type id: for each child, `typeIds[offset]` is the id used in the type
+/// vector.
+table Union {
+  mode: UnionMode;
+  typeIds: [ int ]; // optional, describes typeid of each child.
+}
+
+table Int {
+  bitWidth: int; // restricted to 8, 16, 32, and 64 in v1
+  is_signed: bool;
+}
+
+enum Precision:short {HALF, SINGLE, DOUBLE}
+
+table FloatingPoint {
+  precision: Precision;
+}
+
+/// Unicode with UTF-8 encoding
+table Utf8 {
+}
+
+/// Opaque binary data
+table Binary {
+}
+
+/// Same as Utf8, but with 64-bit offsets, allowing it to represent
+/// extremely large data values.
+table LargeUtf8 {
+}
+
+/// Same as Binary, but with 64-bit offsets, allowing it to represent
+/// extremely large data values.
+table LargeBinary {
+}
+
+/// Logically the same as Utf8, but the internal representation uses a view
+/// struct that contains the string length and either the string's entire data
+/// inline (for small strings) or an inlined prefix, an index of another buffer,
+/// and an offset pointing to a slice in that buffer (for non-small strings).
+///
+/// Since it uses a variable number of data buffers, each Field with this type
+/// must have a corresponding entry in `variadicBufferCounts`.
+table Utf8View {
+}
+
+/// Logically the same as Binary, but the internal representation uses a view
+/// struct that contains the value length and either the value's entire data
+/// inline (for small values) or an inlined prefix, an index of another buffer,
+/// and an offset pointing to a slice in that buffer (for non-small values).
+///
+/// Since it uses a variable number of data buffers, each Field with this type
+/// must have a corresponding entry in `variadicBufferCounts`.
+table BinaryView {
+}
+
+
+table FixedSizeBinary {
+  /// Number of bytes per value
+  byteWidth: int;
+}
+
+table Bool {
+}
+
+/// Contains two child arrays, run_ends and values.
+/// The run_ends child array must be a 16/32/64-bit integer array
+/// which encodes the indices at which the run with the value in
+/// each corresponding index in the values child array ends.
+/// Like list/struct types, the value array can be of any type.
+table RunEndEncoded {
+}
+
+/// Exact decimal value represented as an integer value in two's
+/// complement. Currently 32-bit (4-byte), 64-bit (8-byte),
+/// 128-bit (16-byte) and 256-bit (32-byte) integers are used.
+/// The representation uses the endianness indicated in the Schema.
+table Decimal {
+  /// Total number of decimal digits
+  precision: int;
+
+  /// Number of digits after the decimal point "."
+  scale: int;
+
+  /// Number of bits per value. The accepted widths are 32, 64, 128 and 256.
+  /// We use bitWidth for consistency with Int::bitWidth.
+  bitWidth: int = 128;
+}
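To connect precision, scale, and bitWidth to concrete values: a decimal column stores unscaled two's-complement integers, so 12.34 at scale 2 is stored as the integer 1234. A small self-contained sketch using `java.math.BigDecimal` (illustrative only, not part of this patch):

```java
import java.math.BigDecimal;
import java.math.BigInteger;

public class DecimalStorageSketch {
  public static void main(String[] args) {
    // For a Decimal(precision=9, scale=2, bitWidth=128) column, the value
    // 12.34 is stored as the unscaled two's-complement integer 1234.
    BigDecimal value = new BigDecimal("12.34");
    BigInteger unscaled = value.unscaledValue();
    System.out.println(unscaled + " with scale " + value.scale()); // 1234 with scale 2
  }
}
```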
+
+enum DateUnit: short {
+  DAY,
+  MILLISECOND
+}
+
+/// Date is either a 32-bit or 64-bit signed integer type representing an
+/// elapsed time since the UNIX epoch (1970-01-01), stored in either of two
+/// units:
+///
+/// * Milliseconds (64 bits) indicating UNIX time elapsed since the epoch (no
+///   leap seconds), where the values are evenly divisible by 86400000
+/// * Days (32 bits) since the UNIX epoch
+table Date {
+  unit: DateUnit = MILLISECOND;
+}
+
+enum TimeUnit: short { SECOND, MILLISECOND, MICROSECOND, NANOSECOND }
+
+/// Time is either a 32-bit or 64-bit signed integer type representing an
+/// elapsed time since midnight, stored in one of four units: seconds,
+/// milliseconds, microseconds or nanoseconds.
+///
+/// The integer `bitWidth` depends on the `unit` and must be one of the following:
+/// * SECOND and MILLISECOND: 32 bits
+/// * MICROSECOND and NANOSECOND: 64 bits
+///
+/// The allowed values are between 0 (inclusive) and 86400 (=24*60*60) seconds
+/// (exclusive), adjusted for the time unit (for example, up to 86400000
+/// exclusive for the MILLISECOND unit).
+/// This definition doesn't allow for leap seconds. Time values from
+/// measurements with leap seconds will need to be corrected when ingesting
+/// into Arrow (for example by replacing the value 86400 with 86399).
+table Time {
+  unit: TimeUnit = MILLISECOND;
+  bitWidth: int = 32;
+}
+
+/// Timestamp is a 64-bit signed integer representing an elapsed time since a
+/// fixed epoch, stored in one of four units: seconds, milliseconds,
+/// microseconds or nanoseconds, and is optionally annotated with a timezone.
+///
+/// Timestamp values do not include any leap seconds (in other words, all
+/// days are considered 86400 seconds long).
+///
+/// Timestamps with a non-empty timezone
+/// ------------------------------------
+///
+/// If a Timestamp column has a non-empty timezone value, its epoch is
+/// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone
+/// (the Unix epoch), regardless of the Timestamp's own timezone.
+///
+/// Therefore, timestamp values with a non-empty timezone correspond to
+/// physical points in time together with some additional information about
+/// how the data was obtained and/or how to display it (the timezone).
+///
+/// For example, the timestamp value 0 with the timezone string "Europe/Paris"
+/// corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the
+/// application may prefer to display it as "January 1st 1970, 01h00" in
+/// the Europe/Paris timezone (which is the same physical point in time).
+///
+/// One consequence is that timestamp values with a non-empty timezone
+/// can be compared and ordered directly, since they all share the same
+/// well-known point of reference (the Unix epoch).
+///
+/// Timestamps with an unset / empty timezone
+/// -----------------------------------------
+///
+/// If a Timestamp column has no timezone value, its epoch is
+/// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone.
+///
+/// Therefore, timestamp values without a timezone cannot be meaningfully
+/// interpreted as physical points in time, but only as calendar / clock
+/// indications ("wall clock time") in an unspecified timezone.
+///
+/// For example, the timestamp value 0 with an empty timezone string
+/// corresponds to "January 1st 1970, 00h00" in an unknown timezone: there
+/// is not enough information to interpret it as a well-defined physical
+/// point in time.
+///
+/// One consequence is that timestamp values without a timezone cannot
+/// be reliably compared or ordered, since they may have different points of
+/// reference. In particular, it is *not* possible to interpret an unset
+/// or empty timezone as the same as "UTC".
+///
+/// Conversion between timezones
+/// ----------------------------
+///
+/// If a Timestamp column has a non-empty timezone, changing the timezone
+/// to a different non-empty value is a metadata-only operation:
+/// the timestamp values need not change as their point of reference remains
+/// the same (the Unix epoch).
+///
+/// However, if a Timestamp column has no timezone value, changing it to a
+/// non-empty value requires thinking about the desired semantics.
+/// One possibility is to assume that the original timestamp values are
+/// relative to the epoch of the timezone being set; timestamp values should
+/// then be adjusted to the Unix epoch (for example, changing the timezone from
+/// empty to "Europe/Paris" would require converting the timestamp values
+/// from "Europe/Paris" to "UTC", which seems counter-intuitive but is
+/// nevertheless correct).
+///
+/// Guidelines for encoding data from external libraries
+/// ----------------------------------------------------
+///
+/// Date & time libraries often have multiple different data types for temporal
+/// data. In order to ease interoperability between different implementations,
+/// the Arrow project has some recommendations for encoding these types into a
+/// Timestamp column.
+///
+/// An "instant" represents a physical point in time that has no relevant timezone
+/// (for example, astronomical data). To encode an instant, use a Timestamp with
+/// the timezone string set to "UTC", and make sure the Timestamp values
+/// are relative to the UTC epoch (January 1st 1970, midnight).
+///
+/// A "zoned date-time" represents a physical point in time annotated with an
+/// informative timezone (for example, the timezone in which the data was
+/// recorded). To encode a zoned date-time, use a Timestamp with the timezone
+/// string set to the name of the timezone, and make sure the Timestamp values
+/// are relative to the UTC epoch (January 1st 1970, midnight).
+///
+/// (There is some ambiguity between an instant and a zoned date-time with the
+/// UTC timezone. Both of these are stored the same in Arrow. Typically,
+/// this distinction does not matter. If it does, then an application should
+/// use custom metadata or an extension type to distinguish between the two cases.)
+///
+/// An "offset date-time" represents a physical point in time combined with an
+/// explicit offset from UTC. To encode an offset date-time, use a Timestamp
+/// with the timezone string set to the numeric timezone offset string
+/// (e.g. "+03:00"), and make sure the Timestamp values are relative to
+/// the UTC epoch (January 1st 1970, midnight).
+///
+/// A "naive date-time" (also called "local date-time" in some libraries)
+/// represents a wall clock time combined with a calendar date, but with
+/// no indication of how to map this information to a physical point in time.
+/// Naive date-times must be handled with care because of this missing
+/// information, and also because daylight saving time (DST) may make
+/// some values ambiguous or nonexistent. A naive date-time may be
+/// stored as a struct with Date and Time fields. However, it may also be
+/// encoded into a Timestamp column with an empty timezone. The timestamp
+/// values should be computed "as if" the timezone of the date-time values
+/// was UTC; for example, the naive date-time "January 1st 1970, 00h00" would
+/// be encoded as timestamp value 0.
+table Timestamp {
+  unit: TimeUnit;
+
+  /// The timezone is an optional string indicating the name of a timezone,
+  /// one of:
+  ///
+  /// * As used in the Olson timezone database (the "tz database" or
+  ///   "tzdata"), such as "America/New_York".
+  /// * An absolute timezone offset of the form "+XX:XX" or "-XX:XX",
+  ///   such as "+07:30".
+  ///
+  /// Whether a timezone string is present indicates different semantics about
+  /// the data (see above).
+  timezone: string;
+}
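As a quick illustration of the four recommendations above, here is how they map onto `ArrowType.Timestamp` in the Arrow Java API (a sketch for illustration, not part of this patch):

```java
import org.apache.arrow.vector.types.TimeUnit;
import org.apache.arrow.vector.types.pojo.ArrowType;

public class TimestampEncodingSketch {
  public static void main(String[] args) {
    // "instant": values relative to the UTC epoch, timezone set to "UTC".
    ArrowType.Timestamp instant = new ArrowType.Timestamp(TimeUnit.MICROSECOND, "UTC");
    // "zoned date-time": values still relative to the UTC epoch; the
    // timezone is display/interpretation metadata only.
    ArrowType.Timestamp zoned = new ArrowType.Timestamp(TimeUnit.MICROSECOND, "Europe/Paris");
    // "offset date-time": a fixed numeric offset instead of a zone name.
    ArrowType.Timestamp offset = new ArrowType.Timestamp(TimeUnit.MICROSECOND, "+03:00");
    // "naive date-time": no timezone; values are wall-clock indications.
    ArrowType.Timestamp naive = new ArrowType.Timestamp(TimeUnit.MICROSECOND, null);
    System.out.println(instant + "\n" + zoned + "\n" + offset + "\n" + naive);
  }
}
```

Converting `zoned` to another named timezone would be a metadata-only change, whereas giving `naive` a timezone requires adjusting the stored values as described above.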
+
+enum IntervalUnit: short { YEAR_MONTH, DAY_TIME, MONTH_DAY_NANO}
+// A "calendar" interval which models types that don't necessarily
+// have a precise duration without the context of a base timestamp (e.g.
+// days can differ in length during daylight saving time transitions).
+// All integers in the types below are stored in the endianness indicated
+// by the schema.
+//
+// YEAR_MONTH - Indicates the number of elapsed whole months, stored as
+//   4-byte signed integers.
+// DAY_TIME - Indicates the number of elapsed days and milliseconds (no leap seconds),
+//   stored as 2 contiguous 32-bit signed integers (8-bytes in total). Support
+//   of this IntervalUnit is not required for full arrow compatibility.
+// MONTH_DAY_NANO - A triple of the number of elapsed months, days, and nanoseconds.
+//   The values are stored contiguously in 16-byte blocks. Months and days are
+//   encoded as 32-bit signed integers and nanoseconds is encoded as a 64-bit
+//   signed integer. Nanoseconds does not allow for leap seconds. Each field is
+//   independent (e.g. there is no constraint that nanoseconds have the same
+//   sign as days or that the quantity of nanoseconds represents less than a
+//   day's worth of time).
+table Interval {
+  unit: IntervalUnit;
+}
+
+// An absolute length of time unrelated to any calendar artifacts.
+//
+// For the purposes of Arrow implementations, adding this value to a Timestamp
+// ("t1") naively (i.e. simply summing the two numbers) is acceptable even
+// though in some cases the resulting Timestamp (t2) would not account for
+// leap-seconds during the elapsed time between "t1" and "t2". Similarly,
+// representing the difference between two Unix timestamps is acceptable, but
+// would yield a value that is possibly a few seconds off from the true elapsed
+// time.
+//
+// The resolution defaults to millisecond, but can be any of the other
+// supported TimeUnit values as with Timestamp and Time types. This type is
+// always represented as an 8-byte integer.
+table Duration {
+  unit: TimeUnit = MILLISECOND;
+}
+
+/// ----------------------------------------------------------------------
+/// Top-level Type value, enabling extensible type-specific metadata. We can
+/// add new logical types to Type without breaking backwards compatibility
+
+union Type {
+  Null,
+  Int,
+  FloatingPoint,
+  Binary,
+  Utf8,
+  Bool,
+  Decimal,
+  Date,
+  Time,
+  Timestamp,
+  Interval,
+  List,
+  Struct_,
+  Union,
+  FixedSizeBinary,
+  FixedSizeList,
+  Map,
+  Duration,
+  LargeBinary,
+  LargeUtf8,
+  LargeList,
+  RunEndEncoded,
+  BinaryView,
+  Utf8View,
+  ListView,
+  LargeListView,
+}
+
+/// ----------------------------------------------------------------------
+/// User-defined key-value pairs to add custom metadata to Arrow.
+/// Key namespacing is the responsibility of the user.
+
+table KeyValue {
+  key: string;
+  value: string;
+}
+
+/// ----------------------------------------------------------------------
+/// Dictionary encoding metadata
+/// Maintained for forwards compatibility; in the future,
+/// dictionaries might be explicit maps between integers and values,
+/// allowing for non-contiguous index values.
+enum DictionaryKind : short { DenseArray }
+table DictionaryEncoding {
+  /// The known dictionary id in the application where this data is used. In
+  /// the file or streaming formats, the dictionary ids are found in the
+  /// DictionaryBatch messages
+  id: long;
+
+  /// The dictionary indices are constrained to be non-negative integers. If
+  /// this field is null, the indices must be signed int32. To maximize
+  /// cross-language compatibility and performance, implementations are
+  /// recommended to prefer signed integer types over unsigned integer types
+  /// and to avoid uint64 indices unless they are required by an application.
+  indexType: Int;
+
+  /// By default, dictionaries are not ordered, or the order does not have
+  /// semantic meaning. In some statistical applications, dictionary encoding
+  /// is used to represent ordered categorical data, and we provide a way to
+  /// preserve that metadata here
+  isOrdered: bool;
+
+  dictionaryKind: DictionaryKind;
+}
+
+/// ----------------------------------------------------------------------
+/// A field represents a named column in a record / row batch or child of a
+/// nested type.
+
+table Field {
+  /// Name is not required (e.g. in the fields of a List)
+  name: string;
+
+  /// Whether or not this field can contain nulls. Should be true in general.
+  nullable: bool;
+
+  /// This is the type of the decoded value if the field is dictionary encoded.
+  type: Type;
+
+  /// Present only if the field is dictionary encoded.
+  dictionary: DictionaryEncoding;
+
+  /// children apply only to nested data types like Struct, List and Union. For
+  /// primitive types children will have length 0.
+  children: [ Field ];
+
+  /// User-defined metadata
+  custom_metadata: [ KeyValue ];
+}
+
+/// ----------------------------------------------------------------------
+/// Endianness of the platform producing the data
+
+enum Endianness:short { Little, Big }
+
+/// ----------------------------------------------------------------------
+/// A Buffer represents a single contiguous memory segment
+struct Buffer {
+  /// The relative offset into the shared memory page where the bytes for this
+  /// buffer start
+  offset: long;
+
+  /// The absolute length (in bytes) of the memory buffer. The memory is found
+  /// from offset (inclusive) to offset + length (non-inclusive). When building
+  /// messages using the encapsulated IPC message, padding bytes may be written
+  /// after a buffer, but such padding bytes do not need to be accounted for in
+  /// the size here.
+  length: long;
+}
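The padding rule above means the bytes occupied in an IPC body can exceed `length`. Arrow IPC pads each buffer to an 8-byte boundary (64 bytes is recommended for alignment); a tiny sketch of that arithmetic (illustrative only, not part of this patch):

```java
public class BufferPaddingSketch {
  // Round a buffer length up to the next 8-byte boundary.
  static long padTo8(long length) {
    return (length + 7) & ~7L;
  }

  public static void main(String[] args) {
    // A 13-byte buffer occupies 16 bytes in the IPC body, but the
    // Buffer.length recorded in the metadata remains 13.
    System.out.println(padTo8(13)); // prints 16
  }
}
```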
+
+/// ----------------------------------------------------------------------
+/// A Schema describes the columns in a row batch
+
+table Schema {
+
+  /// Endianness of the buffers. Little Endian by default; if the endianness
+  /// does not match the underlying system, the vectors need to be converted.
+  endianness: Endianness=Little;
+
+  fields: [Field];
+  // User-defined metadata
+  custom_metadata: [ KeyValue ];
+
+  /// Features used in the stream/file.
+  features : [ Feature ];
+}
+
+root_type Schema;
diff --git a/arrow-format/SparseTensor.fbs b/arrow-format/SparseTensor.fbs
new file mode 100644
index 000000000..a6fd2f9e7
--- /dev/null
+++ b/arrow-format/SparseTensor.fbs
@@ -0,0 +1,228 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/// EXPERIMENTAL: Metadata for n-dimensional sparse arrays, aka "sparse tensors".
+/// Arrow implementations in general are not required to implement this type
+
+include "Tensor.fbs";
+
+namespace org.apache.arrow.flatbuf;
+
+/// ----------------------------------------------------------------------
+/// EXPERIMENTAL: Data structures for sparse tensors
+
+/// Coordinate (COO) format of sparse tensor index.
+///
+/// COO's index list is represented as an NxM matrix,
+/// where N is the number of non-zero values,
+/// and M is the number of dimensions of a sparse tensor.
+///
+/// indicesBuffer stores the location and size of the data of this indices
+/// matrix. The value type and the stride of the indices matrix are
+/// specified in the indicesType and indicesStrides fields.
+///
+/// For example, let X be a 2x3x4x5 tensor, and it has the following
+/// 6 non-zero values:
+/// ```text
+///   X[0, 1, 2, 0] := 1
+///   X[1, 1, 2, 3] := 2
+///   X[0, 2, 1, 0] := 3
+///   X[0, 1, 3, 0] := 4
+///   X[0, 1, 2, 1] := 5
+///   X[1, 2, 0, 4] := 6
+/// ```
+/// In COO format, the index matrix of X is the following 6x4 matrix,
+/// with one row of coordinates per non-zero value:
+/// ```text
+///   [[0, 1, 2, 0],
+///    [0, 1, 2, 1],
+///    [0, 1, 3, 0],
+///    [0, 2, 1, 0],
+///    [1, 1, 2, 3],
+///    [1, 2, 0, 4]]
+/// ```
+/// When isCanonical is true, the indices are sorted in lexicographical order
+/// (row-major order, as above), and do not have duplicated entries. Otherwise,
+/// the indices may not be sorted, or may have duplicated entries.
+table SparseTensorIndexCOO {
+  /// The type of values in indicesBuffer
+  indicesType: Int (required);
+
+  /// Non-negative byte offsets to advance one value cell along each dimension
+  /// If omitted, default to row-major order (C-like).
+  indicesStrides: [long];
+
+  /// The location and size of the indices matrix's data
+  indicesBuffer: Buffer (required);
+
+  /// This flag is true if and only if the indices matrix is sorted in
+  /// row-major order, and does not have duplicated entries.
+  /// This sort order is the same as that of TensorFlow's SparseTensor,
+  /// but it is the inverse of the order of SciPy's canonical coo_matrix
+  /// (SciPy employs column-major order for its coo_matrix).
+  isCanonical: bool;
+}
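The canonical (row-major sorted) COO order described above can be reproduced with a plain lexicographic scan of the dense tensor. A small self-contained sketch (illustrative only, not part of this patch):

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class CooIndexSketch {
  public static void main(String[] args) {
    // The 2x3x4x5 example tensor X from the comment above.
    double[][][][] x = new double[2][3][4][5];
    x[0][1][2][0] = 1; x[1][1][2][3] = 2; x[0][2][1][0] = 3;
    x[0][1][3][0] = 4; x[0][1][2][1] = 5; x[1][2][0][4] = 6;

    List<int[]> indices = new ArrayList<>(); // rows of the NxM index matrix
    List<Double> values = new ArrayList<>(); // non-zeros, in canonical order
    for (int i = 0; i < 2; i++)
      for (int j = 0; j < 3; j++)
        for (int k = 0; k < 4; k++)
          for (int l = 0; l < 5; l++)
            if (x[i][j][k][l] != 0.0) {
              indices.add(new int[] {i, j, k, l});
              values.add(x[i][j][k][l]);
            }

    // Prints the 6x4 index matrix from the example; the values come out
    // in canonical order as [1.0, 5.0, 4.0, 3.0, 2.0, 6.0].
    for (int n = 0; n < indices.size(); n++) {
      System.out.println(Arrays.toString(indices.get(n)) + " -> " + values.get(n));
    }
  }
}
```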
+
+enum SparseMatrixCompressedAxis: short { Row, Column }
+
+/// Compressed Sparse format, which is matrix-specific.
+table SparseMatrixIndexCSX {
+  /// Which axis, row or column, is compressed
+  compressedAxis: SparseMatrixCompressedAxis;
+
+  /// The type of values in indptrBuffer
+  indptrType: Int (required);
+
+  /// indptrBuffer stores the location and size of the indptr array that
+  /// represents the range of the rows.
+  /// The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data.
+  /// The length of this array is 1 + (the number of rows), and the type
+  /// of index value is long.
+  ///
+  /// For example, let X be the following 6x4 matrix:
+  /// ```text
+  ///   X := [[0, 1, 2, 0],
+  ///         [0, 0, 3, 0],
+  ///         [0, 4, 0, 5],
+  ///         [0, 0, 0, 0],
+  ///         [6, 0, 7, 8],
+  ///         [0, 9, 0, 0]].
+  /// ```
+  /// The array of non-zero values in X is:
+  /// ```text
+  ///   values(X) = [1, 2, 3, 4, 5, 6, 7, 8, 9].
+  /// ```
+  /// And the indptr of X is:
+  /// ```text
+  ///   indptr(X) = [0, 2, 3, 5, 5, 8, 9].
+  /// ```
+  indptrBuffer: Buffer (required);
+
+  /// The type of values in indicesBuffer
+  indicesType: Int (required);
+
+  /// indicesBuffer stores the location and size of the array that
+  /// contains the column indices of the corresponding non-zero values.
+  /// The type of index value is long.
+  ///
+  /// For example, the indices of the above X are:
+  /// ```text
+  ///   indices(X) = [1, 2, 2, 1, 3, 0, 2, 3, 1].
+  /// ```
+  /// Note that the indices are sorted in lexicographical order for each row.
+  indicesBuffer: Buffer (required);
+}
+
+/// Compressed Sparse Fiber (CSF) sparse tensor index.
+table SparseTensorIndexCSF {
+  /// CSF is a generalization of the compressed sparse row (CSR) index.
+  /// See [smith2017knl](http://shaden.io/pub-files/smith2017knl.pdf)
+  ///
+  /// CSF index recursively compresses each dimension of a tensor into a set
+  /// of prefix trees. Each path from a root to a leaf forms one tensor
+  /// non-zero index. CSF is implemented with two arrays of buffers and one
+  /// array of integers.
+  ///
+  /// For example, let X be a 2x3x4x5 tensor and let it have the following
+  /// 8 non-zero values:
+  /// ```text
+  ///   X[0, 0, 0, 1] := 1
+  ///   X[0, 0, 0, 2] := 2
+  ///   X[0, 1, 0, 0] := 3
+  ///   X[0, 1, 0, 2] := 4
+  ///   X[0, 1, 1, 0] := 5
+  ///   X[1, 1, 1, 0] := 6
+  ///   X[1, 1, 1, 1] := 7
+  ///   X[1, 1, 1, 2] := 8
+  /// ```
+  /// As a prefix tree this would be represented as:
+  /// ```text
+  ///           0          1
+  ///          / \         |
+  ///         0   1        1
+  ///         |  / \       |
+  ///         0 0   1      1
+  ///        /| |\  |     /|\
+  ///       1 2 0 2 0    0 1 2
+  /// ```
+  /// The type of values in indptrBuffers
+  indptrType: Int (required);
+
+  /// indptrBuffers stores the sparsity structure.
+  /// Each two consecutive dimensions in a tensor correspond to a buffer in
+  /// indptrBuffers. A pair of consecutive values at `indptrBuffers[dim][i]`
+  /// and `indptrBuffers[dim][i + 1]` signify a range of nodes in
+  /// `indicesBuffers[dim + 1]` that are children of the `indicesBuffers[dim][i]`
+  /// node.
+  ///
+  /// For example, the indptrBuffers for the above X are:
+  /// ```text
+  ///   indptrBuffer(X) = [
+  ///                      [0, 2, 3],
+  ///                      [0, 1, 3, 4],
+  ///                      [0, 2, 4, 5, 8]
+  ///                     ].
+  /// ```
+  indptrBuffers: [Buffer] (required);
+
+  /// The type of values in indicesBuffers
+  indicesType: Int (required);
+
+  /// indicesBuffers stores values of nodes.
+ /// Each tensor dimension corresponds to a buffer in indicesBuffers. + /// For example, the indicesBuffers for the above X is: + /// ```text + /// indicesBuffer(X) = [ + /// [0, 1], + /// [0, 1, 1], + /// [0, 0, 1, 1], + /// [1, 2, 0, 2, 0, 0, 1, 2] + /// ]. + /// ``` + indicesBuffers: [Buffer] (required); + + /// axisOrder stores the sequence in which dimensions were traversed to + /// produce the prefix tree. + /// For example, the axisOrder for the above X is: + /// ```text + /// axisOrder(X) = [0, 1, 2, 3]. + /// ``` + axisOrder: [int] (required); +} + +union SparseTensorIndex { + SparseTensorIndexCOO, + SparseMatrixIndexCSX, + SparseTensorIndexCSF +} + +table SparseTensor { + /// The type of data contained in a value cell. + /// Currently only fixed-width value types are supported, + /// no strings or nested types. + type: Type (required); + + /// The dimensions of the tensor, optionally named. + shape: [TensorDim] (required); + + /// The number of non-zero values in a sparse tensor. + non_zero_length: long; + + /// Sparse tensor index + sparseIndex: SparseTensorIndex (required); + + /// The location and size of the tensor's data + data: Buffer (required); +} + +root_type SparseTensor; diff --git a/arrow-format/Tensor.fbs b/arrow-format/Tensor.fbs new file mode 100644 index 000000000..409297ccf --- /dev/null +++ b/arrow-format/Tensor.fbs @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/// EXPERIMENTAL: Metadata for n-dimensional arrays, aka "tensors" or +/// "ndarrays". Arrow implementations in general are not required to implement +/// this type + +include "Schema.fbs"; + +namespace org.apache.arrow.flatbuf; + +/// ---------------------------------------------------------------------- +/// Data structures for dense tensors + +/// Shape data for a single axis in a tensor +table TensorDim { + /// Length of dimension + size: long; + + /// Name of the dimension, optional + name: string; +} + +table Tensor { + /// The type of data contained in a value cell. Currently only fixed-width + /// value types are supported, no strings or nested types + type: Type (required); + + /// The dimensions of the tensor, optionally named + shape: [TensorDim] (required); + + /// Non-negative byte offsets to advance one value cell along each dimension + /// If omitted, default to row-major order (C-like). 
+  strides: [long];
+
+  /// The location and size of the tensor's data
+  data: Buffer (required);
+}
+
+root_type Tensor;
diff --git a/arrow-format/substrait/extension_types.yaml b/arrow-format/substrait/extension_types.yaml
new file mode 100644
index 000000000..0073da1ac
--- /dev/null
+++ b/arrow-format/substrait/extension_types.yaml
@@ -0,0 +1,170 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# substrait::{ExtensionTypeVariation, ExtensionType}s
+# for wrapping types which appear in the arrow type system but
+# are not first-class in substrait. These include:
+# - null
+# - unsigned integers
+# - half-precision floating point numbers
+# - 32-bit times and 64-bit dates
+# - timestamps with units other than microseconds
+# - timestamps with timezones other than UTC
+# - 256-bit decimals
+# - sparse and dense unions
+# - dictionary encoded types
+# - durations
+# - string and binary with 64 bit offsets
+# - list with 64-bit offsets
+# - interval<months: i32>
+# - interval<days: i32, millis: i32>
+# - interval<months: i32, days: i32, nanos: i64>
+# - arrow::ExtensionTypes
+#
+# These types fall into several categories of behavior:

+# Certain Arrow data types are, from Substrait's point of view, encodings.
+# These include dictionary, the view types (e.g. binary view, list view),
+# and REE.
+#
+# These types are not logically distinct from the type they are encoding.
+# Specifically, the types meet the following criteria:
+# * There is no value in the decoded type that cannot be represented
+#   as a value in the encoded type and vice versa.
+# * Functions have the same meaning when applied to the encoded type
+#
+# Note: if two types have a different range (e.g. string and large_string) then
+# they do not satisfy the above criteria and are not encodings.
+#
+# These types will never have a Substrait equivalent. From Substrait's point
+# of view these are execution details.

+# The following types are encodings:

+# binary_view
+# list_view
+# dictionary
+# ree

+# Arrow-cpp's Substrait serde does not yet handle parameterized UDTs. This means
+# the following types are not yet supported but may be supported in the future.
+# We define them below in case other implementations support them in the meantime.

+# decimal256
+# large_list
+# fixed_size_list
+# duration

+# Other types are not encodings, but are not first-class in Substrait. These
+# types are often similar to existing Substrait types but define a different range
+# of values. For example, unsigned integer types are very similar to their integer
+# counterparts, but have a different range of values. These types are defined here
+# as extension types.
+#
+# A full description of the types, along with their specified range, can be found
+# in Schema.fbs
+#
+# Consumers should take care when supporting the below types.
+# Should Substrait decide later to support these types, the consumer will need
+# to continue supporting the extension type names as aliases for backwards
+# compatibility.
+types:
+  - name: "null"
+    structure: {}
+  - name: interval_month
+    structure:
+      months: i32
+  - name: interval_day_milli
+    structure:
+      days: i32
+      millis: i32
+  - name: interval_month_day_nano
+    structure:
+      months: i32
+      days: i32
+      nanos: i64
+  # All unsigned integer literals are encoded as user defined literals with
+  # a google.protobuf.UInt64Value message.
+  - name: u8
+    structure: {}
+  - name: u16
+    structure: {}
+  - name: u32
+    structure: {}
+  - name: u64
+    structure: {}
+  # fp16 literals are encoded as user defined literals with
+  # a google.protobuf.UInt32Value message where the lower 16 bits are
+  # the fp16 value.
+  - name: fp16
+    structure: {}
+  # A 64-bit integer covers a much larger range: even though date64 stores
+  # milliseconds rather than days, it can still represent about 50x more
+  # dates than date32. Since it has a different range of values, it is an
+  # extension type.
+  #
+  # date64 literals are encoded as user defined literals with
+  # a google.protobuf.Int64Value message.
+  - name: date_millis
+    structure: {}
+  # time literals are encoded as user defined literals with
+  # a google.protobuf.Int32Value message (for time_seconds/time_millis)
+  # or a google.protobuf.Int64Value message (for time_nanos).
+  - name: time_seconds
+    structure: {}
+  - name: time_millis
+    structure: {}
+  - name: time_nanos
+    structure: {}
+  # Large string literals are encoded using a
+  # google.protobuf.StringValue message.
+  - name: large_string
+    structure: {}
+  # Large binary literals are encoded using a
+  # google.protobuf.BytesValue message.
+  - name: large_binary
+    structure: {}
+  # We cannot generate these today because they are parameterized UDTs and
+  # substrait-cpp does not yet support parameterized UDTs.
+  - name: decimal256
+    structure: {}
+    parameters:
+      - name: precision
+        type: integer
+        min: 0
+        max: 76
+      - name: scale
+        type: integer
+        min: 0
+        max: 76
+  - name: large_list
+    structure: {}
+    parameters:
+      - name: value_type
+        type: dataType
+  - name: fixed_size_list
+    structure: {}
+    parameters:
+      - name: value_type
+        type: dataType
+      - name: dimension
+        type: integer
+        min: 0
+  - name: duration
+    structure: {}
+    parameters:
+      - name: unit
+        type: string
+
diff --git a/ci/scripts/java_build.sh b/ci/scripts/java_build.sh
new file mode 100755
index 000000000..8441e00cc
--- /dev/null
+++ b/ci/scripts/java_build.sh
@@ -0,0 +1,78 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
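+
+# Usage sketch (arguments as wired up in docker-compose.yml; the JNI
+# distribution directory is optional and only needed for the JNI profiles):
+#
+#   ci/scripts/java_build.sh <source_dir> <build_dir> [<java_jni_dist_dir>]
+#
+# e.g. docker-compose.yml runs: java_build.sh /arrow-java /build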
+
+set -eo pipefail
+
+if [[ "${ARROW_JAVA_BUILD:-ON}" != "ON" ]]; then
+  exit
+fi
+
+arrow_dir=${1}
+source_dir=${1}
+build_dir=${2}
+java_jni_dist_dir=${3}
+
+: ${BUILD_DOCS_JAVA:=OFF}
+
+mvn="mvn -B -DskipTests -Drat.skip=true -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn"
+
+if [ -n "${ARROW_JAVA_SKIP_GIT_PLUGIN:-}" ]; then
+  mvn="${mvn} -Dmaven.gitcommitid.skip=true"
+fi
+
+# We want to do an out-of-source build since (when using docker) we'll pollute
+# the source directory with files owned by root, but Maven does not really
+# support this. Instead, copy directories to the build directory.
+
+rm -rf "${build_dir}"
+mkdir -p "${build_dir}/arrow-format"
+cp -r "${source_dir}/arrow-format" "${build_dir}"
+cp -r "${source_dir}/dev" "${build_dir}"
+
+for source_root in $(find "${source_dir}" -not \( -path "${source_dir}"/build -prune \) -type f -name pom.xml -exec realpath -s --relative-to="${source_dir}" '{}' \; |
+                       awk -F/ '{print $1}' |
+                       sort -u); do
+  cp -r "${source_dir}/${source_root}" "${build_dir}"
+done
+
+pushd "${build_dir}"
+
+if [ "${ARROW_JAVA_SHADE_FLATBUFFERS}" = "ON" ]; then
+  mvn="${mvn} -Pshade-flatbuffers"
+fi
+
+if [ "${ARROW_JAVA_CDATA}" = "ON" ]; then
+  mvn="${mvn} -Darrow.c.jni.dist.dir=${java_jni_dist_dir} -Parrow-c-data"
+fi
+
+if [ "${ARROW_JAVA_JNI}" = "ON" ]; then
+  mvn="${mvn} -Darrow.cpp.build.dir=${java_jni_dist_dir} -Parrow-jni"
+fi
+
+# Use `2 * ncores` threads
+${mvn} -T 2C clean install
+
+if [ "${BUILD_DOCS_JAVA}" = "ON" ]; then
+  # HTTP pooling is turned off to avoid download issues https://issues.apache.org/jira/browse/ARROW-11633
+  # GH-43378: Maven site plugins not compatible with multithreading
+  mkdir -p "${build_dir}/docs/java/reference"
+  ${mvn} -Dcheckstyle.skip=true -Dhttp.keepAlive=false -Dmaven.wagon.http.pool=false clean install site
+  rsync -a target/site/apidocs/ "${build_dir}/docs/java/reference"
+fi
+
+popd
diff --git a/ci/scripts/java_test.sh b/ci/scripts/java_test.sh
new file mode 100755
index 000000000..9d4bc018b
--- /dev/null
+++ b/ci/scripts/java_test.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
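+
+# Usage sketch (mirrors java_build.sh; docker-compose.yml runs
+# "java_test.sh /arrow-java /build" after the build step):
+#
+#   ci/scripts/java_test.sh <source_dir> <build_dir> [<java_jni_dist_dir>]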
+ +set -ex + +if [[ "${ARROW_JAVA_TEST:-ON}" != "ON" ]]; then + exit +fi + +arrow_dir=${1} +source_dir=${1} +build_dir=${2} +java_jni_dist_dir=${3} + +mvn="mvn -B -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn" +# Use `2 * ncores` threads +mvn="${mvn} -T 2C" + +pushd ${build_dir} + +${mvn} -Darrow.test.dataRoot="${source_dir}/testing/data" clean test + +projects=() +if [ "${ARROW_JAVA_JNI}" = "ON" ]; then + projects+=(adapter/orc) + projects+=(dataset) + projects+=(gandiva) +fi +if [ "${#projects[@]}" -gt 0 ]; then + ${mvn} clean test \ + -Parrow-jni \ + -pl $(IFS=,; echo "${projects[*]}") \ + -Darrow.cpp.build.dir=${java_jni_dist_dir} +fi + +if [ "${ARROW_JAVA_CDATA}" = "ON" ]; then + ${mvn} clean test -Parrow-c-data -pl c -Darrow.c.jni.dist.dir=${java_jni_dist_dir} +fi + +popd diff --git a/dataset/pom.xml b/dataset/pom.xml index 0c1f55dd6..21f67f1a6 100644 --- a/dataset/pom.xml +++ b/dataset/pom.xml @@ -183,7 +183,7 @@ under the License. --add-reads=org.apache.arrow.dataset=com.fasterxml.jackson.databind --add-opens=java.base/java.nio=org.apache.arrow.dataset,org.apache.arrow.memory.core,ALL-UNNAMED false - ${project.basedir}/../../testing/data + ${project.basedir}/../testing/data diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 000000000..103f2f3ad --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Usage +# ----- +# +# The docker compose file is parametrized using environment variables, the +# defaults are set in .env file. +# +# Example: +# $ ARCH=arm64v8 docker compose build java +# $ ARCH=arm64v8 docker compose run java + +volumes: + maven-cache: + name: maven-cache + +services: + java: + # Usage: + # docker compose build java + # docker compose run java + # Parameters: + # MAVEN: 3.9.6 + # JDK: 11, 17, 21 + image: ${ARCH}/maven:${MAVEN}-eclipse-temurin-${JDK} + volumes: &java-volumes + - .:/arrow-java:delegated + - ${DOCKER_VOLUME_PREFIX}maven-cache:/root/.m2:delegated + command: &java-command > + /bin/bash -c " + /arrow-java/ci/scripts/java_build.sh /arrow-java /build && + /arrow-java/ci/scripts/java_test.sh /arrow-java /build" diff --git a/flight/flight-core/pom.xml b/flight/flight-core/pom.xml index 374f6fcda..9dac97dd9 100644 --- a/flight/flight-core/pom.xml +++ b/flight/flight-core/pom.xml @@ -155,7 +155,7 @@ under the License. --add-opens=org.apache.arrow.flight.core/org.apache.arrow.flight.perf.impl=protobuf.java --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED false - ${project.basedir}/../../../testing/data + ${project.basedir}/../../testing/data @@ -170,7 +170,7 @@ under the License. 
compile-custom - ${basedir}/../../../format/ + ${basedir}/../../arrow-format/ diff --git a/flight/flight-sql-jdbc-core/pom.xml b/flight/flight-sql-jdbc-core/pom.xml index fc033a5ea..8ad7e7680 100644 --- a/flight/flight-sql-jdbc-core/pom.xml +++ b/flight/flight-sql-jdbc-core/pom.xml @@ -144,7 +144,7 @@ under the License. false - ${project.basedir}/../../../testing/data + ${project.basedir}/../../testing/data diff --git a/testing b/testing new file mode 160000 index 000000000..4d209492d --- /dev/null +++ b/testing @@ -0,0 +1 @@ +Subproject commit 4d209492d514c2d3cb2d392681b9aa00e6d8da1c diff --git a/tools/src/test/java/org/apache/arrow/tools/TestIntegration.java b/tools/src/test/java/org/apache/arrow/tools/TestIntegration.java index 28f9a9010..935252287 100644 --- a/tools/src/test/java/org/apache/arrow/tools/TestIntegration.java +++ b/tools/src/test/java/org/apache/arrow/tools/TestIntegration.java @@ -23,6 +23,7 @@ import static org.apache.arrow.tools.ArrowFileTestFixtures.writeInput; import static org.apache.arrow.tools.ArrowFileTestFixtures.writeVariableWidthViewInput; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -34,6 +35,7 @@ import java.io.File; import java.io.IOException; import java.io.StringReader; +import java.net.URL; import java.util.Map; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; @@ -163,12 +165,9 @@ public void testValid() throws Exception { @Test public void testJSONRoundTripWithVariableWidth() throws Exception { - File testJSONFile = - new File("../../docs/source/format/integration_json_examples/simple.json") - .getCanonicalFile(); - if (!testJSONFile.exists()) { - testJSONFile = new File("../docs/source/format/integration_json_examples/simple.json"); - } + URL resource = getClass().getResource("/integration_json_simple.json"); + assertNotNull(resource); + File testJSONFile = new File(resource.getFile()).getCanonicalFile(); File testOutFile = new File(testFolder, "testOut.arrow"); File testRoundTripJSONFile = new File(testFolder, "testOut.json"); testOutFile.delete(); @@ -211,12 +210,9 @@ public void testJSONRoundTripWithVariableWidth() throws Exception { @Test public void testJSONRoundTripWithStruct() throws Exception { - File testJSONFile = - new File("../../docs/source/format/integration_json_examples/struct.json") - .getCanonicalFile(); - if (!testJSONFile.exists()) { - testJSONFile = new File("../docs/source/format/integration_json_examples/struct.json"); - } + URL resource = getClass().getResource("/integration_json_struct.json"); + assertNotNull(resource); + File testJSONFile = new File(resource.getFile()).getCanonicalFile(); File testOutFile = new File(testFolder, "testOutStruct.arrow"); File testRoundTripJSONFile = new File(testFolder, "testOutStruct.json"); testOutFile.delete(); diff --git a/tools/src/test/resources/integration_json_simple.json b/tools/src/test/resources/integration_json_simple.json new file mode 100644 index 000000000..663472919 --- /dev/null +++ b/tools/src/test/resources/integration_json_simple.json @@ -0,0 +1,98 @@ +{ + "schema": { + "fields": [ + { + "name": "foo", + "type": {"name": "int", "isSigned": true, "bitWidth": 32}, + "nullable": true, + "children": [] + }, + { + "name": "bar", + "type": {"name": "floatingpoint", "precision": "DOUBLE"}, + "nullable": true, + "children": [] + }, + { + 
"name": "baz", + "type": {"name": "utf8"}, + "nullable": true, + "children": [] + } + ] + }, + "batches": [ + { + "count": 5, + "columns": [ + { + "name": "foo", + "count": 5, + "VALIDITY": [1, 0, 1, 1, 1], + "DATA": [1, 2, 3, 4, 5] + }, + { + "name": "bar", + "count": 5, + "VALIDITY": [1, 0, 0, 1, 1], + "DATA": [1.0, 2.0, 3.0, 4.0, 5.0] + }, + { + "name": "baz", + "count": 5, + "VALIDITY": [1, 0, 0, 1, 1], + "OFFSET": [0, 2, 2, 2, 5, 9], + "DATA": ["aa", "", "", "bbb", "cccc"] + } + ] + }, + { + "count": 5, + "columns": [ + { + "name": "foo", + "count": 5, + "VALIDITY": [1, 1, 1, 1, 1], + "DATA": [1, 2, 3, 4, 5] + }, + { + "name": "bar", + "count": 5, + "VALIDITY": [1, 1, 1, 1, 1], + "DATA": [1.0, 2.0, 3.0, 4.0, 5.0] + }, + { + "name": "baz", + "count": 5, + "VALIDITY": [1, 1, 1, 1, 1], + "OFFSET": [0, 2, 3, 4, 7, 11], + "DATA": ["aa", "b", "c", "ddd", "eeee"] + } + ] + }, + { + "count": 5, + "columns": [ + { + "name": "foo", + "count": 5, + "VALIDITY": [0, 0, 0, 0, 0], + "DATA": [1, 2, 3, 4, 5] + }, + { + "name": "bar", + "count": 5, + "VALIDITY": [0, 0, 0, 0, 0], + "DATA": [1.0, 2.0, 3.0, 4.0, 5.0] + }, + { + "name": "baz", + "count": 5, + "VALIDITY": [0, 0, 0, 0, 0], + "OFFSET": [0, 0, 0, 0, 0, 0], + "DATA": ["", "", "", "", ""] + } + ] + } + ] +} diff --git a/tools/src/test/resources/integration_json_struct.json b/tools/src/test/resources/integration_json_struct.json new file mode 100644 index 000000000..4e6cc774e --- /dev/null +++ b/tools/src/test/resources/integration_json_struct.json @@ -0,0 +1,201 @@ +{ + "schema": { + "fields": [ + { + "name": "struct_nullable", + "type": { + "name": "struct" + }, + "nullable": true, + "children": [ + { + "name": "f1", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 32 + }, + "nullable": true, + "children": [] + }, + { + "name": "f2", + "type": { + "name": "utf8" + }, + "nullable": true, + "children": [] + } + ] + } + ] + }, + "batches": [ + { + "count": 7, + "columns": [ + { + "name": "struct_nullable", + "count": 7, + "VALIDITY": [ + 0, + 1, + 1, + 1, + 0, + 1, + 0 + ], + "children": [ + { + "name": "f1", + "count": 7, + "VALIDITY": [ + 1, + 0, + 1, + 1, + 1, + 0, + 0 + ], + "DATA": [ + 1402032511, + 290876774, + 137773603, + 410361374, + 1959836418, + 1995074679, + -163525262 + ] + }, + { + "name": "f2", + "count": 7, + "VALIDITY": [ + 0, + 1, + 1, + 1, + 0, + 1, + 0 + ], + "OFFSET": [ + 0, + 0, + 7, + 14, + 21, + 21, + 28, + 28 + ], + "DATA": [ + "", + "MhRNxD4", + "3F9HBxK", + "aVd88fp", + "", + "3loZrRf", + "" + ] + } + ] + } + ] + }, + { + "count": 10, + "columns": [ + { + "name": "struct_nullable", + "count": 10, + "VALIDITY": [ + 0, + 1, + 1, + 0, + 1, + 0, + 0, + 1, + 1, + 1 + ], + "children": [ + { + "name": "f1", + "count": 10, + "VALIDITY": [ + 0, + 0, + 0, + 0, + 0, + 0, + 1, + 0, + 0, + 0 + ], + "DATA": [ + -2041500147, + 1715692943, + -35444996, + 1425496657, + 112765084, + 1760754983, + 413888857, + 2039738337, + -1924327700, + 670528518 + ] + }, + { + "name": "f2", + "count": 10, + "VALIDITY": [ + 1, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 0 + ], + "OFFSET": [ + 0, + 7, + 7, + 7, + 14, + 21, + 28, + 35, + 42, + 49, + 49 + ], + "DATA": [ + "AS5oARE", + "", + "", + "JGdagcX", + "78SLiRw", + "vbGf7OY", + "5uh5fTs", + "0ilsf82", + "LjS9MbU", + "" + ] + } + ] + } + ] + } + ] +} diff --git a/vector/src/test/java/org/apache/arrow/vector/ipc/TestJSONFile.java b/vector/src/test/java/org/apache/arrow/vector/ipc/TestJSONFile.java index 8037212aa..b394a667d 100644 --- 
a/vector/src/test/java/org/apache/arrow/vector/ipc/TestJSONFile.java +++ b/vector/src/test/java/org/apache/arrow/vector/ipc/TestJSONFile.java @@ -22,6 +22,7 @@ import java.io.File; import java.io.IOException; +import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; import java.util.Arrays; @@ -318,10 +319,9 @@ public void testWriteReadDecimalJSON() throws IOException { @Test public void testSetStructLength() throws IOException { - File file = new File("../../docs/source/format/integration_json_examples/struct.json"); - if (!file.exists()) { - file = new File("../docs/source/format/integration_json_examples/struct.json"); - } + URL resource = getClass().getResource("/integration_json_struct.json"); + assertNotNull(resource); + File file = new File(resource.getFile()); try (BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); JsonFileReader reader = new JsonFileReader(file, readerAllocator)) { diff --git a/vector/src/test/resources/integration_json_struct.json b/vector/src/test/resources/integration_json_struct.json new file mode 100644 index 000000000..4e6cc774e --- /dev/null +++ b/vector/src/test/resources/integration_json_struct.json @@ -0,0 +1,201 @@ +{ + "schema": { + "fields": [ + { + "name": "struct_nullable", + "type": { + "name": "struct" + }, + "nullable": true, + "children": [ + { + "name": "f1", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 32 + }, + "nullable": true, + "children": [] + }, + { + "name": "f2", + "type": { + "name": "utf8" + }, + "nullable": true, + "children": [] + } + ] + } + ] + }, + "batches": [ + { + "count": 7, + "columns": [ + { + "name": "struct_nullable", + "count": 7, + "VALIDITY": [ + 0, + 1, + 1, + 1, + 0, + 1, + 0 + ], + "children": [ + { + "name": "f1", + "count": 7, + "VALIDITY": [ + 1, + 0, + 1, + 1, + 1, + 0, + 0 + ], + "DATA": [ + 1402032511, + 290876774, + 137773603, + 410361374, + 1959836418, + 1995074679, + -163525262 + ] + }, + { + "name": "f2", + "count": 7, + "VALIDITY": [ + 0, + 1, + 1, + 1, + 0, + 1, + 0 + ], + "OFFSET": [ + 0, + 0, + 7, + 14, + 21, + 21, + 28, + 28 + ], + "DATA": [ + "", + "MhRNxD4", + "3F9HBxK", + "aVd88fp", + "", + "3loZrRf", + "" + ] + } + ] + } + ] + }, + { + "count": 10, + "columns": [ + { + "name": "struct_nullable", + "count": 10, + "VALIDITY": [ + 0, + 1, + 1, + 0, + 1, + 0, + 0, + 1, + 1, + 1 + ], + "children": [ + { + "name": "f1", + "count": 10, + "VALIDITY": [ + 0, + 0, + 0, + 0, + 0, + 0, + 1, + 0, + 0, + 0 + ], + "DATA": [ + -2041500147, + 1715692943, + -35444996, + 1425496657, + 112765084, + 1760754983, + 413888857, + 2039738337, + -1924327700, + 670528518 + ] + }, + { + "name": "f2", + "count": 10, + "VALIDITY": [ + 1, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 0 + ], + "OFFSET": [ + 0, + 7, + 7, + 7, + 14, + 21, + 28, + 35, + 42, + 49, + 49 + ], + "DATA": [ + "AS5oARE", + "", + "", + "JGdagcX", + "78SLiRw", + "vbGf7OY", + "5uh5fTs", + "0ilsf82", + "LjS9MbU", + "" + ] + } + ] + } + ] + } + ] +}
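Since the tests above now load these JSON files from the classpath, reading them outside JUnit works the same way. A minimal sketch using the `JsonFileReader` API already exercised in this diff (illustrative only, not part of the patch; it assumes the resource is on the classpath as added here):

```java
import java.io.File;
import java.net.URL;

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.ipc.JsonFileReader;

public class ReadIntegrationJson {
  public static void main(String[] args) throws Exception {
    // Resolve the resource exactly as the updated tests do.
    URL resource = ReadIntegrationJson.class.getResource("/integration_json_struct.json");
    File file = new File(resource.getFile());
    try (BufferAllocator allocator = new RootAllocator();
        JsonFileReader reader = new JsonFileReader(file, allocator);
        VectorSchemaRoot root = VectorSchemaRoot.create(reader.start(), allocator)) {
      while (reader.read(root)) {
        System.out.println("batch with " + root.getRowCount() + " rows");
      }
    }
  }
}
```

Each successful `read` call fills `root` with the next entry of the file's "batches" array and returns false once the batches are exhausted.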