Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow empty strings in arrow columns (as distinct from null values) #103

Merged
merged 2 commits into from
May 1, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 14 additions & 33 deletions packages/perspective/src/cpp/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,39 +201,38 @@ namespace arrow {
}
}

template<typename T>
void
fill_col_dict(t_uint32 nrows, t_uint32 dsize, val dcol, val vkeys, t_col_sptr col, const char* destType)
fill_col_dict(val dictvec, t_col_sptr col)
{
// ptaylor: This assumes the dictionary is either a Binary or Utf8 Vector. Should it support other Vector types?
val vdata = dcol["values"];
val vdata = dictvec["values"];
t_int32 vsize = vdata["length"].as<t_int32>();
std::vector<t_uchar> data;
data.reserve(vsize);
data.resize(vsize);
vecFromTypedArray(vdata, data.data(), vsize);

val voffsets = dcol["valueOffsets"];
val voffsets = dictvec["valueOffsets"];
t_int32 osize = voffsets["length"].as<t_int32>();
std::vector<t_int32> offsets;
offsets.reserve(osize);
offsets.resize(osize);
vecFromTypedArray(voffsets, offsets.data(), osize);

// Get number of dictionary entries
t_uint32 dsize = dictvec["length"].as<t_uint32>();

t_vocab* vocab = col->_get_vocab();
t_str elem;

for (t_uint32 i = 0; i < dsize; ++i) {
t_int32 bidx = offsets[i];
std::size_t es = offsets[i+1] - bidx;
assert(es > 0);
elem.assign(reinterpret_cast<char*>(data.data())+bidx, es);
t_uindex idx = vocab->get_interned(elem);
// Make sure there are no duplicates in the arrow dictionary
assert(idx == i);
}

// Now process index keys into dictionary
arrow::vecFromTypedArray(vkeys, col->get_nth<T>(0), nrows, destType);
}
}

Expand Down Expand Up @@ -340,29 +339,15 @@ _fill_col<std::string>(val dcol, t_col_sptr col, t_bool is_arrow)
if (dcol["constructor"]["name"].as<t_str>() == "DictionaryVector") {

val dictvec = dcol["dictionary"];
arrow::fill_col_dict(dictvec, col);

// Get number of dictionary entries
t_uint32 dsize = dictvec["length"].as<t_uint32>();

val vkeys = dcol["indices"]["values"];
// Now process index into dictionary

// Perspective stores string indices in a 32bit unsigned array
// Javascript's typed arrays handle copying from various bitwidth arrays properly
auto width = vkeys["constructor"]["BYTES_PER_ELEMENT"].as<t_int32>();

switch (width) {
case 1:
arrow::fill_col_dict<t_int8>(nrows, dsize, dictvec, vkeys, col, "Uint32Array");
break;
case 2:
arrow::fill_col_dict<t_int16>(nrows, dsize, dictvec, vkeys, col, "Uint32Array");
break;
case 4:
arrow::fill_col_dict<t_int32>(nrows, dsize, dictvec, vkeys, col, "Uint32Array");
break;
default:
break;
}
val vkeys = dcol["indices"]["values"];
arrow::vecFromTypedArray(vkeys, col->get_nth<t_uindex>(0), nrows, "Uint32Array");

} else if (dcol["constructor"]["name"].as<t_str>() == "Utf8Vector" ||
dcol["constructor"]["name"].as<t_str>() == "BinaryVector") {

Expand All @@ -385,12 +370,8 @@ _fill_col<std::string>(val dcol, t_col_sptr col, t_bool is_arrow)
for (t_int32 i = 0; i < nrows; ++i) {
t_int32 bidx = offsets[i];
std::size_t es = offsets[i+1] - bidx;
if (es > 0) {
elem.assign(reinterpret_cast<char*>(data.data())+bidx, es);
col->set_nth(i, elem);
} else {
col->clear(i);
}
elem.assign(reinterpret_cast<char*>(data.data())+bidx, es);
col->set_nth(i, elem);
}
}
} else {
Expand Down
Binary file modified packages/perspective/test/arrow/test-null.arrow
Binary file not shown.
2 changes: 1 addition & 1 deletion packages/perspective/test/js/constructors.js
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ var arrow_result = [
"datetime(ms)": +(new Date("2018-01-26")), "datetime(us)": +(new Date("2018-01-26")), "datetime(ns)": +(new Date("2018-01-26"))},
{"f32": 3.5, "f64": 3.5, "i64": 3, "i32": 3, "i16": 3, "i8": 3, "bool": true, "char": "c", "dict": "c",
"datetime(ms)": +(new Date("2018-01-27")), "datetime(us)": +(new Date("2018-01-27")), "datetime(ns)": +(new Date("2018-01-27"))},
{"f32": 4.5, "f64": 4.5, "i64": 4, "i32": 4, "i16": 4, "i8": 4, "bool": false, "char": "d", "dict": "d",
{"f32": 4.5, "f64": 4.5, "i64": 4, "i32": 4, "i16": 4, "i8": 4, "bool": false, "char": "", "dict": "",
"datetime(ms)": +(new Date("2018-01-28")), "datetime(us)": +(new Date("2018-01-28")), "datetime(ns)": +(new Date("2018-01-28"))},
{"f32": null, "f64": null, "i64": null, "i32": null, "i16": null, "i8": null, "bool": null, "char": null, "dict": null,
"datetime(ms)": null, "datetime(us)": null, "datetime(ns)": null}
Expand Down