Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] JSON tree algorithms refactor II: Constructing device JSON column #16205

Draft
wants to merge 40 commits into
base: branch-24.10
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
1ec9617
added csr data struct
shrshi Jun 11, 2024
022d7ce
formatting
shrshi Jun 11, 2024
382633f
added test
shrshi Jun 25, 2024
1823854
formatting
shrshi Jun 25, 2024
4a7e2a5
Merge branch 'branch-24.08' into json-tree-refactor-ii
shrshi Jun 25, 2024
8d5ddfb
Merge branch 'branch-24.08' into json-tree-refactor
shrshi Jun 26, 2024
84a7749
fixing csr construction
shrshi Jun 28, 2024
810c389
moving the csr algorithms
shrshi Jun 28, 2024
6a1a415
formatting
shrshi Jun 28, 2024
d3468ba
Merge branch 'branch-24.08' into json-tree-refactor-ii
shrshi Jun 28, 2024
85c197d
Merge branch 'branch-24.08' into json-tree-refactor
shrshi Jun 28, 2024
996c6dd
Merge branch 'json-tree-refactor' of github.com:shrshi/cudf into json…
shrshi Jun 28, 2024
3675140
ignoring leaf nodes with non-leaf siblings
shrshi Jul 5, 2024
389df50
formatting
shrshi Jul 6, 2024
4bba629
moving to experimental namespace
shrshi Jul 15, 2024
25530f6
Merge branch 'branch-24.08' into json-tree-refactor
shrshi Jul 15, 2024
df9e65b
formatting
shrshi Jul 15, 2024
d1588c8
removed node properties from csr struct - will be introduced in stage…
shrshi Jul 15, 2024
b809703
partial commit before merge
shrshi Jul 16, 2024
c576370
added two validation checks for column tree
shrshi Jul 17, 2024
b04cebc
formatting
shrshi Jul 17, 2024
b804209
partial work commit
shrshi Jul 21, 2024
b8e8c07
formatting
shrshi Jul 22, 2024
7e1a756
merging branch 24.08 into current branch
shrshi Jul 24, 2024
5541b93
partial commit
shrshi Jul 24, 2024
1490ce9
Merge branch 'branch-24.10' into json-tree-refactor
shrshi Jul 24, 2024
d05e670
better csr construction
shrshi Jul 30, 2024
1ce88be
formatting
shrshi Jul 30, 2024
d6d724c
exec policy is no sync
shrshi Jul 30, 2024
2622d6b
fix copyright year
shrshi Jul 30, 2024
9498372
fixing max row offsets
shrshi Jul 31, 2024
4339b0a
formatting
shrshi Jul 31, 2024
e61288b
Merge branch 'branch-24.10' into json-tree-refactor
shrshi Jul 31, 2024
9b6b7ff
struct docs
shrshi Jul 31, 2024
53db174
Merge branch 'json-tree-refactor' of github.com:shrshi/cudf into json…
shrshi Jul 31, 2024
85608eb
cudf exports!
shrshi Jul 31, 2024
3c21e04
merge after 15979 update
shrshi Aug 1, 2024
3900ee3
refactoring after the csr updates
shrshi Aug 2, 2024
3949cda
minor fixes
shrshi Aug 2, 2024
4d88fe5
formatting
shrshi Aug 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,8 @@ add_library(
src/io/functions.cpp
src/io/json/byte_range_info.cu
src/io/json/json_column.cu
src/io/json/column_tree_construction.cu
src/io/json/device_column_tree_extraction.cu
src/io/json/json_normalization.cu
src/io/json/json_tree.cu
src/io/json/nested_json_gpu.cu
Expand Down
553 changes: 553 additions & 0 deletions cpp/src/io/json/column_tree_construction.cu

Large diffs are not rendered by default.

1,046 changes: 1,046 additions & 0 deletions cpp/src/io/json/device_column_tree_extraction.cu

Large diffs are not rendered by default.

843 changes: 1 addition & 842 deletions cpp/src/io/json/json_column.cu

Large diffs are not rendered by default.

747 changes: 747 additions & 0 deletions cpp/src/io/json/json_column_csr.cu

Large diffs are not rendered by default.

51 changes: 1 addition & 50 deletions cpp/src/io/json/json_tree.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/

#include "io/utilities/hostdevice_vector.hpp"
#include "json_utils.hpp"
#include "nested_json.hpp"

#include <cudf/detail/cuco_helpers.hpp>
Expand All @@ -33,7 +34,6 @@
#include <rmm/exec_policy.hpp>
#include <rmm/resource_ref.hpp>

#include <cub/device/device_radix_sort.cuh>
#include <cuco/static_set.cuh>
#include <cuda/functional>
#include <thrust/binary_search.h>
Expand Down Expand Up @@ -139,55 +139,6 @@ struct is_nested_end {
}
};

/**
* @brief Returns stable sorted keys and its sorted order
*
* Uses cub stable radix sort. The order is internally generated, hence it saves a copy and memory.
* Since the key and order is returned, using double buffer helps to avoid extra copy to user
* provided output iterator.
*
* @tparam IndexType sorted order type
* @tparam KeyType key type
* @param keys keys to sort
* @param stream CUDA stream used for device memory operations and kernel launches.
* @return Sorted keys and indices producing that sorted order
*/
template <typename IndexType = size_t, typename KeyType>
std::pair<rmm::device_uvector<KeyType>, rmm::device_uvector<IndexType>> stable_sorted_key_order(
cudf::device_span<KeyType const> keys, rmm::cuda_stream_view stream)
{
CUDF_FUNC_RANGE();

// Determine temporary device storage requirements
rmm::device_uvector<KeyType> keys_buffer1(keys.size(), stream);
rmm::device_uvector<KeyType> keys_buffer2(keys.size(), stream);
rmm::device_uvector<IndexType> order_buffer1(keys.size(), stream);
rmm::device_uvector<IndexType> order_buffer2(keys.size(), stream);
cub::DoubleBuffer<IndexType> order_buffer(order_buffer1.data(), order_buffer2.data());
cub::DoubleBuffer<KeyType> keys_buffer(keys_buffer1.data(), keys_buffer2.data());
size_t temp_storage_bytes = 0;
cub::DeviceRadixSort::SortPairs(
nullptr, temp_storage_bytes, keys_buffer, order_buffer, keys.size());
rmm::device_buffer d_temp_storage(temp_storage_bytes, stream);

thrust::copy(rmm::exec_policy(stream), keys.begin(), keys.end(), keys_buffer1.begin());
thrust::sequence(rmm::exec_policy(stream), order_buffer1.begin(), order_buffer1.end());

cub::DeviceRadixSort::SortPairs(d_temp_storage.data(),
temp_storage_bytes,
keys_buffer,
order_buffer,
keys.size(),
0,
sizeof(KeyType) * 8,
stream.value());

return std::pair{keys_buffer.Current() == keys_buffer1.data() ? std::move(keys_buffer1)
: std::move(keys_buffer2),
order_buffer.Current() == order_buffer1.data() ? std::move(order_buffer1)
: std::move(order_buffer2)};
}

/**
* @brief Propagate parent node from first sibling to other siblings.
*
Expand Down
82 changes: 82 additions & 0 deletions cpp/src/io/json/json_utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/io/detail/tokenize_json.hpp>
#include <cudf/io/types.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/bit.hpp>
#include <cudf/utilities/error.hpp>

#include <rmm/exec_policy.hpp>
#include <rmm/resource_ref.hpp>

#include <cub/device/device_radix_sort.cuh>
#include <thrust/sequence.h>

namespace cudf::io::json::detail {
/**
* @brief Returns stable sorted keys and its sorted order
*
* Uses cub stable radix sort. The order is internally generated, hence it saves a copy and memory.
* Since the key and order is returned, using double buffer helps to avoid extra copy to user
* provided output iterator.
*
* @tparam IndexType sorted order type
* @tparam KeyType key type
* @param keys keys to sort
* @param stream CUDA stream used for device memory operations and kernel launches.
* @return Sorted keys and indices producing that sorted order
*/
template <typename IndexType = size_t, typename KeyType>
std::pair<rmm::device_uvector<KeyType>, rmm::device_uvector<IndexType>> stable_sorted_key_order(
cudf::device_span<KeyType const> keys, rmm::cuda_stream_view stream)
{
CUDF_FUNC_RANGE();

// Determine temporary device storage requirements
rmm::device_uvector<KeyType> keys_buffer1(keys.size(), stream);
rmm::device_uvector<KeyType> keys_buffer2(keys.size(), stream);
rmm::device_uvector<IndexType> order_buffer1(keys.size(), stream);
rmm::device_uvector<IndexType> order_buffer2(keys.size(), stream);
cub::DoubleBuffer<IndexType> order_buffer(order_buffer1.data(), order_buffer2.data());
cub::DoubleBuffer<KeyType> keys_buffer(keys_buffer1.data(), keys_buffer2.data());
size_t temp_storage_bytes = 0;
cub::DeviceRadixSort::SortPairs(
nullptr, temp_storage_bytes, keys_buffer, order_buffer, keys.size());
rmm::device_buffer d_temp_storage(temp_storage_bytes, stream);

thrust::copy(rmm::exec_policy(stream), keys.begin(), keys.end(), keys_buffer1.begin());
thrust::sequence(rmm::exec_policy(stream), order_buffer1.begin(), order_buffer1.end());

cub::DeviceRadixSort::SortPairs(d_temp_storage.data(),
temp_storage_bytes,
keys_buffer,
order_buffer,
keys.size(),
0,
sizeof(KeyType) * 8,
stream.value());

return std::pair{keys_buffer.Current() == keys_buffer1.data() ? std::move(keys_buffer1)
: std::move(keys_buffer2),
order_buffer.Current() == order_buffer1.data() ? std::move(order_buffer1)
: std::move(order_buffer2)};
}

} // namespace cudf::io::json::detail
190 changes: 165 additions & 25 deletions cpp/src/io/json/nested_json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,13 @@
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/export.hpp>

#include <rmm/device_scalar.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>
#include <rmm/resource_ref.hpp>

#include <thrust/sequence.h>

#include <map>
#include <vector>

Expand Down Expand Up @@ -52,24 +57,6 @@ struct tree_meta_t {
*/
enum class json_col_t : char { ListColumn, StructColumn, StringColumn, Unknown };

/**
* @brief Enum class to specify whether we just push onto and pop from the stack or whether we also
* reset to an empty stack on a newline character.
*/
enum class stack_behavior_t : char {
/// Opening brackets and braces, [, {, push onto the stack, closing brackets and braces, ], }, pop
/// from the stack
PushPopWithoutReset,

/// Opening brackets and braces, [, {, push onto the stack, closing brackets and braces, ], }, pop
/// from the stack. Delimiter characters are passed when the stack context is constructed to
/// reset to an empty stack.
ResetOnDelimiter
};

// Default name for a list's child column
constexpr auto list_child_name{"element"};

/**
* @brief Intermediate representation of data from a nested JSON input
*/
Expand Down Expand Up @@ -185,6 +172,120 @@ struct device_json_column {
{
}
};
/**
* @brief Holds member data pointers of `d_json_column`
*
*/
struct json_column_data {
using row_offset_t = json_column::row_offset_t;
row_offset_t* string_offsets;
row_offset_t* string_lengths;
row_offset_t* child_offsets;
bitmask_type* validity;
};

/**
* @brief Enum class to specify whether we just push onto and pop from the stack or whether we also
* reset to an empty stack on a newline character.
*/
enum class stack_behavior_t : char {
/// Opening brackets and braces, [, {, push onto the stack, closing brackets and braces, ], }, pop
/// from the stack
PushPopWithoutReset,

/// Opening brackets and braces, [, {, push onto the stack, closing brackets and braces, ], }, pop
/// from the stack. Delimiter characters are passed when the stack context is constructed to
/// reset to an empty stack.
ResetOnDelimiter
};

// Default name for a list's child column
constexpr auto list_child_name{"element"};

namespace experimental {
using row_offset_t = size_type;
/*
* @brief Sparse graph adjacency matrix stored in Compressed Sparse Row (CSR) format.
*/
struct csr {
rmm::device_uvector<NodeIndexT> rowidx;
rmm::device_uvector<NodeIndexT> colidx;
};

/*
* @brief Auxiliary column tree properties that are required to construct the device json
* column subtree, but not required for the final cudf column construction.
*/
struct column_tree_properties {
rmm::device_scalar<NodeIndexT> num_levels;
rmm::device_uvector<NodeT> categories;
rmm::device_uvector<row_offset_t> max_row_offsets;
rmm::device_uvector<NodeIndexT> mapped_ids;
};

/*
* @brief Positional and validity information for subgraph extracted from column tree. The offsets
* for each node in the member arrays can be obtained by segmented sums of the max_row_offsets array
* in column_tree_properties.
*/
struct device_column_subtree_properties {
rmm::device_uvector<SymbolOffsetT> string_offsets;
rmm::device_uvector<SymbolOffsetT> string_lengths;
// Row offsets
rmm::device_uvector<SymbolOffsetT> child_offsets;
// Validity bitmap
rmm::device_buffer validity;
};

/*
* @brief Unvalidated column tree stored in Compressed Sparse Row (CSR) format. The device json
* column subtree - the subgraph that conforms to column tree properties - is extracted and further
* processed according to the JSON reader options passed. Only the final processed subgraph is
* annotated with information required to construct cuDF columns.
*/
struct column_tree {
// position of nnzs
csr adjacency;
// device_json_column properties
// Indicator array for the device column subtree
// Stores the number of rows in the column if the node is part of device column subtree
// Stores zero otherwise
rmm::device_uvector<row_offset_t> subtree_nrows;
device_column_subtree_properties d_props;
};

namespace detail {
/**
* @brief Reduce node tree into column tree by aggregating each property of column.
*
* @param tree json node tree to reduce (modified in-place, but restored to original state)
* @param col_ids column ids of each node (modified in-place, but restored to original state)
* @param row_offsets row offsets of each node (modified in-place, but restored to original state)
* @param stream The CUDA stream to which kernels are dispatched
* @return A tuple containing the column tree, identifier for each column and the maximum row index
* in each column
*/
CUDF_EXPORT
std::tuple<csr, column_tree_properties> reduce_to_column_tree(
tree_meta_t& tree,
device_span<NodeIndexT> original_col_ids,
device_span<size_type> row_offsets,
bool is_array_of_arrays,
NodeIndexT const row_array_parent_col_id,
rmm::cuda_stream_view stream);

void make_device_json_column(device_span<SymbolT const> input,
tree_meta_t& tree,
device_span<NodeIndexT> col_ids,
device_span<size_type> row_offsets,
device_json_column& root,
bool is_array_of_arrays,
cudf::io::json_reader_options const& options,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

} // namespace detail
} // namespace experimental

namespace detail {

Expand Down Expand Up @@ -285,20 +386,30 @@ get_array_children_indices(TreeDepthT row_array_children_level,
device_span<TreeDepthT const> node_levels,
device_span<NodeIndexT const> parent_node_ids,
rmm::cuda_stream_view stream);

/**
* @brief Reduce node tree into column tree by aggregating each property of column.
*
* @param tree json node tree to reduce (modified in-place, but restored to original state)
* @param col_ids column ids of each node (modified in-place, but restored to original state)
* @param row_offsets row offsets of each node (modified in-place, but restored to original state)
* @param stream The CUDA stream to which kernels are dispatched
* @return A tuple containing the column tree, identifier for each column and the maximum row index
* in each column
* @param tree Node tree representation of JSON string
* @param original_col_ids Column ids of nodes
* @param sorted_col_ids Sorted column ids of nodes
* @param ordered_node_ids Node ids of nodes sorted by column ids
* @param row_offsets Row offsets of nodes
* @param is_array_of_arrays Whether the tree is an array of arrays
* @param row_array_parent_col_id Column id of row array, if is_array_of_arrays is true
* @param stream CUDA stream used for device memory operations and kernel launches
* @return A tuple of column tree representation of JSON string, column ids of columns, and
* max row offsets of columns
*/
CUDF_EXPORT
std::tuple<tree_meta_t, rmm::device_uvector<NodeIndexT>, rmm::device_uvector<size_type>>
reduce_to_column_tree(tree_meta_t& tree,
device_span<NodeIndexT> col_ids,
device_span<NodeIndexT> original_col_ids,
device_span<NodeIndexT> sorted_col_ids,
device_span<NodeIndexT> ordered_node_ids,
device_span<size_type> row_offsets,
bool is_array_of_arrays,
NodeIndexT const row_array_parent_col_id,
rmm::cuda_stream_view stream);

/**
Expand All @@ -311,6 +422,35 @@ reduce_to_column_tree(tree_meta_t& tree,
cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& options,
rmm::cuda_stream_view stream);

/**
* @brief Constructs `d_json_column` from node tree representation
* Newly constructed columns are insert into `root`'s children.
* `root` must be a list type.
*
* @param input Input JSON string device data
* @param tree Node tree representation of the JSON string
* @param col_ids Column ids of the nodes in the tree
* @param row_offsets Row offsets of the nodes in the tree
* @param root Root node of the `d_json_column` tree
* @param is_array_of_arrays Whether the tree is an array of arrays
* @param options Parsing options specifying the parsing behaviour
* options affecting behaviour are
* is_enabled_lines: Whether the input is a line-delimited JSON
* is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the device memory
* of child_offets and validity members of `d_json_column`
*/
void make_device_json_column(device_span<SymbolT const> input,
tree_meta_t& tree,
device_span<NodeIndexT> col_ids,
device_span<size_type> row_offsets,
device_json_column& root,
bool is_array_of_arrays,
cudf::io::json_reader_options const& options,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

/**
* @brief Parses the given JSON string and generates table from the given input.
*
Expand Down
Loading
Loading