Skip to content

Commit

Permalink
Add benchmark support, add python tests, add JSON array -> JSONL conv…
Browse files Browse the repository at this point in the history
…erter
  • Loading branch information
timkpaine committed Sep 28, 2023
1 parent cd458dd commit 775be00
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 8 deletions.
33 changes: 33 additions & 0 deletions cpp/perspective/src/cpp/arrow_format.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
#include <perspective/arrow_format.h>
#include <arrow/util/value_parsing.h>
#include <arrow/io/memory.h>
#include <rapidjson/document.h>
#include "rapidjson/writer.h"
#include "rapidjson/stringbuffer.h"

#ifdef PSP_ENABLE_WASM
// This causes build warnings
Expand Down Expand Up @@ -437,6 +440,36 @@ namespace apachearrow {
jsonToTable(std::string& json, bool is_update,
std::unordered_map<std::string, std::shared_ptr<arrow::DataType>>&
schema) {

// NOTE: Arrow only supports JSONL/NDJSON as of 12.0.0, so a top-level JSON
// array is rewritten here as one object per line before it reaches the
// Arrow reader. This incurs some overhead, but in general it should still
// be better in C++ than doing it from the host language.
if (json[0] == '[') {
    rapidjson::Document document;
    document.Parse(json.c_str());

    // Reject both malformed input and a root value that is not an array.
    if (document.HasParseError() || !document.IsArray()) {
        PSP_COMPLAIN_AND_ABORT("Unable to convert detected JSON array to JSONL");
    }

    std::string new_json;
    // Only a capacity hint: the JSONL text is normally about the size of
    // the input array, so this avoids most regrowth while appending rows.
    new_json.reserve(json.size());

    rapidjson::StringBuffer buffer;
    rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
    auto arr = document.GetArray();
    for (rapidjson::Value::ConstValueIterator itr = arr.Begin(); itr != arr.End(); ++itr) {
        if (!itr->IsObject()) {
            PSP_COMPLAIN_AND_ABORT("Unable to convert detected JSON array to JSONL - Values of JSON array must be objects");
        }

        // Serialize one row, then append it together with its line terminator.
        itr->Accept(writer);
        new_json.append(buffer.GetString(), buffer.GetSize());
        new_json += '\n';

        // A Writer accepts a single root value; it must be reset before it
        // is reused for the next row, otherwise rapidjson's "only one root"
        // assertion fires when assertions are enabled.
        buffer.Clear();
        writer.Reset(buffer);
    }

    // Hand the converted text to `json` without copying it a second time.
    json.swap(new_json);
}

arrow::io::IOContext io_context = arrow::io::default_io_context();
auto input = std::make_shared<arrow::io::BufferReader>(json);
auto read_options = arrow::json::ReadOptions::Defaults();
Expand Down
12 changes: 12 additions & 0 deletions python/perspective/bench/runtime/perspective_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def register_benchmarks(self):
`Runner` can find the tests at runtime."""
self.benchmark_table_arrow()
self.benchmark_table_csv()
self.benchmark_table_json()
self.benchmark_view_zero()
self.benchmark_view_one()
self.benchmark_view_two()
Expand All @@ -101,6 +102,14 @@ def benchmark_table_csv(self):
func = Benchmark(lambda: Table(csv), meta=test_meta)
setattr(self, "table_csv", func)

def benchmark_table_json(self):
    """Benchmark table construction from JSON.

    The input is produced once, up front, by calling `to_json()` on the
    benchmark's view; only the `Table(json)` call is wrapped in the
    `Benchmark`. The result is registered on the instance as `table_json`.
    """
    json = self._view.to_json()
    test_meta = make_meta("table", "json")
    func = Benchmark(lambda: Table(json), meta=test_meta)
    setattr(self, "table_json", func)

def benchmark_view_zero(self):
"""Benchmark view creation with zero pivots."""
func = Benchmark(lambda: self._table.view(), meta=make_meta("view", "zero"))
Expand Down Expand Up @@ -184,6 +193,7 @@ def benchmark_to_format_zero(self):
for name in (
"arrow",
"csv",
"json",
"columns",
"records",
):
Expand All @@ -197,6 +207,7 @@ def benchmark_to_format_one(self):
for name in (
"arrow",
"csv",
"json",
"columns",
"records",
):
Expand All @@ -214,6 +225,7 @@ def benchmark_to_format_two(self):
for name in (
"arrow",
"csv",
"json",
"columns",
"records",
):
Expand Down
9 changes: 1 addition & 8 deletions python/perspective/perspective/src/table.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,15 +141,8 @@ namespace binding {
csv_or_json_as_string = "{}";
}

if (csv_or_json_as_string[0] == '[') {
// TODO TKP
// JSON loader accepts JSONL / NDJSON, not JSON array
// Should we throw or try to reparse?
csv_or_json_as_string = "{}";
}

// Now parse the JSON / CSV
if (csv_or_json_as_string[0] == '{') {
if (csv_or_json_as_string[0] == '{' || csv_or_json_as_string[0] == '[') {
arrow_loader.init_json(csv_or_json_as_string, is_update, map);
} else {
arrow_loader.init_csv(csv_or_json_as_string, is_update, map);
Expand Down
36 changes: 36 additions & 0 deletions python/perspective/perspective/tests/table/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,42 @@ def test_table_correct_csv_nan_intermittent(self):
assert tbl.size() == 3
assert tbl.view().to_dict() == {"str": ["abc", "", "ghi"], "float": [None, 2.5, None]}

def test_table_json(self):
    """A newline-delimited JSON string loads into a table with int, float,
    str and bool columns, and the view returns the rows in input order."""
    data = """{"a": 1, "b": 3.4, "c": "abc", "d": true}
{"a": 2, "b": 4.5, "c": "def", "d": true}
{"a": 3, "b": 6.7, "c": "ghi", "d": false}"""
    tbl = Table(data)
    assert tbl.schema() == {"a": int, "b": float, "c": str, "d": bool}
    view = tbl.view()
    assert view.to_dict() == {"a": [1, 2, 3], "b": [3.4, 4.5, 6.7], "c": ["abc", "def", "ghi"], "d": [True, True, False]}

def test_table_json_with_nulls(self):
    """JSON `null` values in float and str columns come back as `None`
    and do not change the schema inferred for those columns."""
    data = """{"a": 1, "b": 3.4, "c": "abc", "d": true}
{"a": 2, "b": null, "c": "def", "d": true}
{"a": 3, "b": 6.7, "c": null, "d": false}"""
    tbl = Table(data)
    assert tbl.schema() == {"a": int, "b": float, "c": str, "d": bool}
    view = tbl.view()
    assert view.to_dict() == {"a": [1, 2, 3], "b": [3.4, None, 6.7], "c": ["abc", "def", None], "d": [True, True, False]}

def test_table_json_with_nulls_updated(self):
    """A column whose only initial value is `null` is given a `str` schema,
    and a later JSON update on the indexed table can add a string to it."""
    data = """{"a": 1, "b": null}"""
    tbl = Table(data, index="a")
    assert tbl.schema() == {"a": int, "b": str}
    view = tbl.view()
    assert view.to_dict() == {"a": [1], "b": [None]}
    # The update introduces a new index value, so it appends a second row.
    tbl.update("""{"a": 2, "b": "hey"}""")
    assert view.to_dict() == {"a": [1, 2], "b": [None, "hey"]}

def test_table_json_array_type(self):
    """A top-level JSON array of objects is accepted both by the `Table`
    constructor and by `update`, with each array element becoming a row."""
    data = """[{"a": 1, "b": 3.4, "c": "abc", "d": true},{"a": 2, "b": 4.5, "c": "def", "d": true}, {"a": 3, "b": 6.7, "c": "ghi", "d": false}]"""
    tbl = Table(data)
    assert tbl.schema() == {"a": int, "b": float, "c": str, "d": bool}
    view = tbl.view()
    assert view.to_dict() == {"a": [1, 2, 3], "b": [3.4, 4.5, 6.7], "c": ["abc", "def", "ghi"], "d": [True, True, False]}
    # The update payload is also a JSON array, here with a single element.
    tbl.update("""[{"a": 4, "b": 9.9, "c": "yep", "d": true}]""")
    assert view.to_dict() == {"a": [1, 2, 3, 4], "b": [3.4, 4.5, 6.7, 9.9], "c": ["abc", "def", "ghi", "yep"], "d": [True, True, False, True]}

def test_table_string_column_with_nulls_update_and_filter(self):
tbl = Table([{"a": "1", "b": 2, "c": "3"}, {"a": "2", "b": 3, "c": "4"}, {"a": "3", "b": 3, "c": None}], index="a")
view = tbl.view(filter=[["c", "==", "4"]])
Expand Down

0 comments on commit 775be00

Please sign in to comment.