Skip to content

Commit

Permalink
Fix a JNI bug in JSON parsing fixup (#15550)
Browse files Browse the repository at this point in the history
In the current code, when parsing JSON, if no columns can be parsed out of the data then an empty table is returned. Earlier we put in a workaround for this so that we could pass in the number of rows needed, and the JSON parsing code would make a table of null values with that many rows. That workaround had some issues with structs and lists, which need an extended way to produce the null scalar. This change adds the code to do just that.

Authors:
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Jason Lowe (https://github.com/jlowe)

URL: #15550
  • Loading branch information
revans2 authored Apr 26, 2024
1 parent 65c2b53 commit c62c5f6
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 3 deletions.
28 changes: 27 additions & 1 deletion java/src/main/java/ai/rapids/cudf/Schema.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;

/**
* The schema of data to be read in.
Expand Down Expand Up @@ -221,6 +222,13 @@ public DType[] getChildTypes() {
return ret;
}

/**
 * Get the number of direct child schemas.
 * @return the size of the child schema list, or 0 when no child list has been set.
 */
public int getNumChildren() {
  return childSchemas == null ? 0 : childSchemas.size();
}

int[] getFlattenedNumChildren() {
flattenIfNeeded();
return flattenedCounts;
Expand All @@ -243,7 +251,25 @@ public boolean isStructOrHasStructDescendant() {
return false;
}

public static class Builder {
/**
 * Convert this schema into the equivalent HostColumnVector.DataType, recursing into any
 * child schemas.
 * <ul>
 *   <li>LIST: a ListType wrapping the converted type of its single child schema.</li>
 *   <li>STRUCT: a StructType holding the converted types of all child schemas, or a
 *       StructType with no children when no child list has been set.</li>
 *   <li>anything else: a BasicType for the top level type.</li>
 * </ul>
 * Every type is constructed with {@code true} as its first argument.
 * NOTE(review): presumably that flag marks the type as nullable - confirm against
 * HostColumnVector.
 * @return the host data type describing this schema.
 */
public HostColumnVector.DataType asHostDataType() {
  if (topLevelType == DType.LIST) {
    // A list schema is expected to carry exactly one child: the element schema.
    assert(childSchemas != null && childSchemas.size() == 1);
    Schema elementSchema = childSchemas.get(0);
    return new HostColumnVector.ListType(true, elementSchema.asHostDataType());
  }

  if (topLevelType == DType.STRUCT) {
    if (childSchemas == null) {
      // No child list at all: build a struct type without any fields.
      return new HostColumnVector.StructType(true);
    }
    List<HostColumnVector.DataType> fieldTypes = new ArrayList<>(childSchemas.size());
    for (Schema fieldSchema : childSchemas) {
      fieldTypes.add(fieldSchema.asHostDataType());
    }
    return new HostColumnVector.StructType(true, fieldTypes);
  }

  // Not a nested type, so the top level type alone describes the data.
  return new HostColumnVector.BasicType(true, topLevelType);
}

public static class Builder {
private final DType topLevelType;
private final List<String> names;
private final List<Builder> types;
Expand Down
22 changes: 20 additions & 2 deletions java/src/main/java/ai/rapids/cudf/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -1220,8 +1220,26 @@ private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emp
columns[i] = tbl.getColumn(index).incRefCount();
}
} else {
try (Scalar s = Scalar.fromNull(types[i])) {
columns[i] = ColumnVector.fromScalar(s, rowCount);
if (types[i] == DType.LIST) {
Schema listSchema = schema.getChild(i);
Schema elementSchema = listSchema.getChild(0);
try (Scalar s = Scalar.listFromNull(elementSchema.asHostDataType())) {
columns[i] = ColumnVector.fromScalar(s, rowCount);
}
} else if (types[i] == DType.STRUCT) {
Schema structSchema = schema.getChild(i);
int numStructChildren = structSchema.getNumChildren();
DataType[] structChildrenTypes = new DataType[numStructChildren];
for (int j = 0; j < numStructChildren; j++) {
structChildrenTypes[j] = structSchema.getChild(j).asHostDataType();
}
try (Scalar s = Scalar.structFromNull(structChildrenTypes)) {
columns[i] = ColumnVector.fromScalar(s, rowCount);
}
} else {
try (Scalar s = Scalar.fromNull(types[i])) {
columns[i] = ColumnVector.fromScalar(s, rowCount);
}
}
}
}
Expand Down

0 comments on commit c62c5f6

Please sign in to comment.