format/SparseTensor.fbs

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

/// EXPERIMENTAL: Metadata for n-dimensional sparse arrays, aka "sparse tensors".
/// Arrow implementations in general are not required to implement this type

include "Tensor.fbs";

namespace org.apache.arrow.flatbuf;

/// ----------------------------------------------------------------------
/// EXPERIMENTAL: Data structures for sparse tensors

/// Coordinate (COO) format of sparse tensor index.
///
/// COO's index list are represented as a NxM matrix,
/// where N is the number of non-zero values,
/// and M is the number of dimensions of a sparse tensor.
///
/// indicesBuffer stores the location and size of the data of this indices
/// matrix.  The value type and the stride of the indices matrix is
/// specified in indicesType and indicesStrides fields.
///
/// For example, let X be a 2x3x4x5 tensor, and it has the following
/// 6 non-zero values:
/// ```text
///   X[0, 1, 2, 0] := 1
///   X[1, 1, 2, 3] := 2
///   X[0, 2, 1, 0] := 3
///   X[0, 1, 3, 0] := 4
///   X[0, 1, 2, 1] := 5
///   X[1, 2, 0, 4] := 6
/// ```
/// In COO format, the index matrix of X is the following 4x6 matrix:
/// ```text
///   [[0, 0, 0, 0, 1, 1],
///    [1, 1, 1, 2, 1, 2],
///    [2, 2, 3, 1, 2, 0],
///    [0, 1, 0, 0, 3, 4]]
/// ```
/// When isCanonical is true, the indices is sorted in lexicographical order
/// (row-major order), and it does not have duplicated entries.  Otherwise,
/// the indices may not be sorted, or may have duplicated entries.
table SparseTensorIndexCOO {
  /// The type of values in indicesBuffer
  indicesType: Int (required);

  /// Non-negative byte offsets to advance one value cell along each dimension
  /// If omitted, default to row-major order (C-like).
  indicesStrides: [long];

  /// The location and size of the indices matrix's data
  indicesBuffer: Buffer (required);

  /// This flag is true if and only if the indices matrix is sorted in
  /// row-major order, and does not have duplicated entries.
  /// This sort order is the same as of Tensorflow's SparseTensor,
  /// but it is inverse order of SciPy's canonical coo_matrix
  /// (SciPy employs column-major order for its coo_matrix).
  isCanonical: bool;
}

enum SparseMatrixCompressedAxis: short { Row, Column }

/// Compressed Sparse format, that is matrix-specific.
table SparseMatrixIndexCSX {
  /// Which axis, row or column, is compressed
  compressedAxis: SparseMatrixCompressedAxis;

  /// The type of values in indptrBuffer
  indptrType: Int (required);

  /// indptrBuffer stores the location and size of indptr array that
  /// represents the range of the rows.
  /// The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data.
  /// The length of this array is 1 + (the number of rows), and the type
  /// of index value is long.
  ///
  /// For example, let X be the following 6x4 matrix:
  /// ```text
  ///   X := [[0, 1, 2, 0],
  ///         [0, 0, 3, 0],
  ///         [0, 4, 0, 5],
  ///         [0, 0, 0, 0],
  ///         [6, 0, 7, 8],
  ///         [0, 9, 0, 0]].
  /// ```
  /// The array of non-zero values in X is:
  /// ```text
  ///   values(X) = [1, 2, 3, 4, 5, 6, 7, 8, 9].
  /// ```
  /// And the indptr of X is:
  /// ```text
  ///   indptr(X) = [0, 2, 3, 5, 5, 8, 10].
  /// ```
  indptrBuffer: Buffer (required);

  /// The type of values in indicesBuffer
  indicesType: Int (required);

  /// indicesBuffer stores the location and size of the array that
  /// contains the column indices of the corresponding non-zero values.
  /// The type of index value is long.
  ///
  /// For example, the indices of the above X is:
  /// ```text
  ///   indices(X) = [1, 2, 2, 1, 3, 0, 2, 3, 1].
  /// ```
  /// Note that the indices are sorted in lexicographical order for each row.
  indicesBuffer: Buffer (required);
}

/// Compressed Sparse Fiber (CSF) sparse tensor index.
table SparseTensorIndexCSF {
  /// CSF is a generalization of compressed sparse row (CSR) index.
  /// See [smith2017knl](http://shaden.io/pub-files/smith2017knl.pdf)
  ///
  /// CSF index recursively compresses each dimension of a tensor into a set
  /// of prefix trees. Each path from a root to leaf forms one tensor
  /// non-zero index. CSF is implemented with two arrays of buffers and one
  /// arrays of integers.
  ///
  /// For example, let X be a 2x3x4x5 tensor and let it have the following
  /// 8 non-zero values:
  /// ```text
  ///   X[0, 0, 0, 1] := 1
  ///   X[0, 0, 0, 2] := 2
  ///   X[0, 1, 0, 0] := 3
  ///   X[0, 1, 0, 2] := 4
  ///   X[0, 1, 1, 0] := 5
  ///   X[1, 1, 1, 0] := 6
  ///   X[1, 1, 1, 1] := 7
  ///   X[1, 1, 1, 2] := 8
  /// ```
  /// As a prefix tree this would be represented as:
  /// ```text
  ///         0          1
  ///        / \         |
  ///       0   1        1
  ///      /   / \       |
  ///     0   0   1      1
  ///    /|  /|   |    /| |
  ///   1 2 0 2   0   0 1 2
  /// ```
  /// The type of values in indptrBuffers
  indptrType: Int (required);

  /// indptrBuffers stores the sparsity structure.
  /// Each two consecutive dimensions in a tensor correspond to a buffer in
  /// indptrBuffers. A pair of consecutive values at `indptrBuffers[dim][i]`
  /// and `indptrBuffers[dim][i + 1]` signify a range of nodes in
  /// `indicesBuffers[dim + 1]` who are children of `indicesBuffers[dim][i]` node.
  ///
  /// For example, the indptrBuffers for the above X is:
  /// ```text
  ///   indptrBuffer(X) = [
  ///                       [0, 2, 3],
  ///                       [0, 1, 3, 4],
  ///                       [0, 2, 4, 5, 8]
  ///                     ].
  /// ```
  indptrBuffers: [Buffer] (required);

  /// The type of values in indicesBuffers
  indicesType: Int (required);

  /// indicesBuffers stores values of nodes.
  /// Each tensor dimension corresponds to a buffer in indicesBuffers.
  /// For example, the indicesBuffers for the above X is:
  /// ```text
  ///   indicesBuffer(X) = [
  ///                        [0, 1],
  ///                        [0, 1, 1],
  ///                        [0, 0, 1, 1],
  ///                        [1, 2, 0, 2, 0, 0, 1, 2]
  ///                      ].
  /// ```
  indicesBuffers: [Buffer] (required);

  /// axisOrder stores the sequence in which dimensions were traversed to
  /// produce the prefix tree.
  /// For example, the axisOrder for the above X is:
  /// ```text
  ///   axisOrder(X) = [0, 1, 2, 3].
  /// ```
  axisOrder: [int] (required);
}

union SparseTensorIndex {
  SparseTensorIndexCOO,
  SparseMatrixIndexCSX,
  SparseTensorIndexCSF
}

table SparseTensor {
  /// The type of data contained in a value cell.
  /// Currently only fixed-width value types are supported,
  /// no strings or nested types.
  type: Type (required);

  /// The dimensions of the tensor, optionally named.
  shape: [TensorDim] (required);

  /// The number of non-zero values in a sparse tensor.
  non_zero_length: long;

  /// Sparse tensor index
  sparseIndex: SparseTensorIndex (required);

  /// The location and size of the tensor's data
  data: Buffer (required);
}

root_type SparseTensor;