From 4d9a9e0bebe3c76c6eb3df3c96a6eef915790af7 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 24 Oct 2024 11:41:59 -0700 Subject: [PATCH 1/4] Remove `gatherJSONColumns` --- java/src/main/java/ai/rapids/cudf/Table.java | 279 +------------------ java/src/main/native/src/TableJni.cpp | 24 +- 2 files changed, 23 insertions(+), 280 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index dbee53640aa..6bc3082d1d3 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -1092,224 +1092,6 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer) { return readJSON(schema, opts, buffer, 0, buffer.length); } - private static class DidViewChange { - ColumnVector changeWasNeeded = null; - boolean noChangeNeeded = false; - - public static DidViewChange yes(ColumnVector cv) { - DidViewChange ret = new DidViewChange(); - ret.changeWasNeeded = cv; - return ret; - } - - public static DidViewChange no() { - DidViewChange ret = new DidViewChange(); - ret.noChangeNeeded = true; - return ret; - } - } - - private static DidViewChange gatherJSONColumns(Schema schema, TableWithMeta.NestedChildren children, - ColumnView cv) { - // We need to do this recursively to be sure it all matches as expected. - // If we run into problems where the data types don't match, we are not - // going to fix up the data types. We are only going to reorder the columns. - if (schema.getType() == DType.STRUCT) { - if (cv.getType() != DType.STRUCT) { - // The types don't match so just return the input unchanged... - return DidViewChange.no(); - } else { - String[] foundNames; - if (children == null) { - foundNames = new String[0]; - } else { - foundNames = children.getNames(); - } - HashMap indices = new HashMap<>(); - for (int i = 0; i < foundNames.length; i++) { - indices.put(foundNames[i], i); - } - // We might need to rearrange the columns to match what we want. - DType[] types = schema.getChildTypes(); - String[] neededNames = schema.getColumnNames(); - ColumnView[] columns = new ColumnView[neededNames.length]; - try { - boolean somethingChanged = false; - if (columns.length != foundNames.length) { - somethingChanged = true; - } - for (int i = 0; i < columns.length; i++) { - String neededColumnName = neededNames[i]; - Integer index = indices.get(neededColumnName); - Schema childSchema = schema.getChild(i); - if (index != null) { - if (childSchema.isStructOrHasStructDescendant()) { - ColumnView child = cv.getChildColumnView(index); - boolean shouldCloseChild = true; - try { - if (index != i) { - somethingChanged = true; - } - DidViewChange childResult = gatherJSONColumns(schema.getChild(i), - children.getChild(index), child); - if (childResult.noChangeNeeded) { - shouldCloseChild = false; - columns[i] = child; - } else { - somethingChanged = true; - columns[i] = childResult.changeWasNeeded; - } - } finally { - if (shouldCloseChild) { - child.close(); - } - } - } else { - if (index != i) { - somethingChanged = true; - } - columns[i] = cv.getChildColumnView(index); - } - } else { - somethingChanged = true; - if (types[i] == DType.LIST) { - try (Scalar s = Scalar.listFromNull(childSchema.getChild(0).asHostDataType())) { - columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); - } - } else if (types[i] == DType.STRUCT) { - int numStructChildren = childSchema.getNumChildren(); - HostColumnVector.DataType[] structChildren = new HostColumnVector.DataType[numStructChildren]; - for (int structChildIndex = 0; structChildIndex < numStructChildren; structChildIndex++) { - structChildren[structChildIndex] = childSchema.getChild(structChildIndex).asHostDataType(); - } - try (Scalar s = Scalar.structFromNull(structChildren)) { - columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); - } - } else { - try (Scalar s = Scalar.fromNull(types[i])) { - columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); - } - } - } - } - if (somethingChanged) { - try (ColumnView ret = new ColumnView(cv.type, cv.rows, Optional.of(cv.nullCount), - cv.getValid(), null, columns)) { - return DidViewChange.yes(ret.copyToColumnVector()); - } - } else { - return DidViewChange.no(); - } - } finally { - for (ColumnView c: columns) { - if (c != null) { - c.close(); - } - } - } - } - } else if (schema.getType() == DType.LIST && cv.getType() == DType.LIST) { - if (schema.isStructOrHasStructDescendant()) { - String [] childNames = children.getNames(); - if (childNames.length == 2 && - "offsets".equals(childNames[0]) && - "element".equals(childNames[1])) { - try (ColumnView child = cv.getChildColumnView(0)){ - DidViewChange listResult = gatherJSONColumns(schema.getChild(0), - children.getChild(1), child); - if (listResult.noChangeNeeded) { - return DidViewChange.no(); - } else { - try (ColumnView listView = new ColumnView(cv.type, cv.rows, - Optional.of(cv.nullCount), cv.getValid(), cv.getOffsets(), - new ColumnView[]{listResult.changeWasNeeded})) { - return DidViewChange.yes(listView.copyToColumnVector()); - } finally { - listResult.changeWasNeeded.close(); - } - } - } - } - } - // Nothing to change so just return the input, but we need to inc a ref count to really - // make it work, so for now we are going to turn it into a ColumnVector. - return DidViewChange.no(); - } else { - // Nothing to change so just return the input, but we need to inc a ref count to really - // make it work, so for now we are going to turn it into a ColumnVector. - return DidViewChange.no(); - } - } - - private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emptyRowCount) { - String[] neededColumns = schema.getColumnNames(); - if (neededColumns == null || neededColumns.length == 0) { - return twm.releaseTable(); - } else { - String[] foundNames = twm.getColumnNames(); - HashMap indices = new HashMap<>(); - for (int i = 0; i < foundNames.length; i++) { - indices.put(foundNames[i], i); - } - // We might need to rearrange the columns to match what we want. - DType[] types = schema.getChildTypes(); - ColumnVector[] columns = new ColumnVector[neededColumns.length]; - try (Table tbl = twm.releaseTable()) { - int rowCount = tbl == null ? emptyRowCount : (int)tbl.getRowCount(); - if (rowCount < 0) { - throw new IllegalStateException( - "No empty row count provided and the table read has no row count or columns"); - } - for (int i = 0; i < columns.length; i++) { - String neededColumnName = neededColumns[i]; - Integer index = indices.get(neededColumnName); - if (index != null) { - if (schema.getChild(i).isStructOrHasStructDescendant()) { - DidViewChange gathered = gatherJSONColumns(schema.getChild(i), twm.getChild(index), - tbl.getColumn(index)); - if (gathered.noChangeNeeded) { - columns[i] = tbl.getColumn(index).incRefCount(); - } else { - columns[i] = gathered.changeWasNeeded; - } - } else { - columns[i] = tbl.getColumn(index).incRefCount(); - } - } else { - if (types[i] == DType.LIST) { - Schema listSchema = schema.getChild(i); - Schema elementSchema = listSchema.getChild(0); - try (Scalar s = Scalar.listFromNull(elementSchema.asHostDataType())) { - columns[i] = ColumnVector.fromScalar(s, rowCount); - } - } else if (types[i] == DType.STRUCT) { - Schema structSchema = schema.getChild(i); - int numStructChildren = structSchema.getNumChildren(); - DataType[] structChildrenTypes = new DataType[numStructChildren]; - for (int j = 0; j < numStructChildren; j++) { - structChildrenTypes[j] = structSchema.getChild(j).asHostDataType(); - } - try (Scalar s = Scalar.structFromNull(structChildrenTypes)) { - columns[i] = ColumnVector.fromScalar(s, rowCount); - } - } else { - try (Scalar s = Scalar.fromNull(types[i])) { - columns[i] = ColumnVector.fromScalar(s, rowCount); - } - } - } - } - return new Table(columns); - } finally { - for (ColumnVector c: columns) { - if (c != null) { - c.close(); - } - } - } - } - } - /** * Read a JSON file. * @param schema the schema of the file. You may use Schema.INFERRED to infer the schema. @@ -1339,8 +1121,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) { cudfPruneSchema, opts.experimental(), opts.getLineDelimiter()))) { - - return gatherJSONColumns(schema, twm, -1); + return twm.releaseTable(); } } @@ -1356,23 +1137,6 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) { */ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, long len, HostMemoryAllocator hostMemoryAllocator) { - return readJSON(schema, opts, buffer, offset, len, hostMemoryAllocator, -1); - } - - /** - * Read JSON formatted data. - * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. - * @param opts various JSON parsing options. - * @param buffer raw UTF8 formatted bytes. - * @param offset the starting offset into buffer. - * @param len the number of bytes to parse. - * @param hostMemoryAllocator allocator for host memory buffers - * @param emptyRowCount the number of rows to return if no columns were read. - * @return the data parsed as a table on the GPU. - */ - public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, - long len, HostMemoryAllocator hostMemoryAllocator, - int emptyRowCount) { if (len <= 0) { len = buffer.length - offset; } @@ -1381,16 +1145,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, lon assert offset >= 0 && offset < buffer.length; try (HostMemoryBuffer newBuf = hostMemoryAllocator.allocate(len)) { newBuf.setBytes(0, buffer, offset, len); - return readJSON(schema, opts, newBuf, 0, len, emptyRowCount); + return readJSON(schema, opts, newBuf, 0, len); } } - public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, - long len, int emptyRowCount) { - return readJSON(schema, opts, buffer, offset, len, DefaultHostMemoryAllocator.get(), - emptyRowCount); - } - public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, long len) { return readJSON(schema, opts, buffer, offset, len, DefaultHostMemoryAllocator.get()); @@ -1464,22 +1222,7 @@ public static TableWithMeta readAndInferJSON(JSONOptions opts, DataSource ds) { * @return the data parsed as a table on the GPU. */ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer buffer, - long offset, long len) { - return readJSON(schema, opts, buffer, offset, len, -1); - } - - /** - * Read JSON formatted data. - * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. - * @param opts various JSON parsing options. - * @param buffer raw UTF8 formatted bytes. - * @param offset the starting offset into buffer. - * @param len the number of bytes to parse. - * @param emptyRowCount the number of rows to use if no columns were found. - * @return the data parsed as a table on the GPU. - */ - public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer buffer, - long offset, long len, int emptyRowCount) { + long offset, long len) { if (len <= 0) { len = buffer.length - offset; } @@ -1508,7 +1251,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b cudfPruneSchema, opts.experimental(), opts.getLineDelimiter()))) { - return gatherJSONColumns(schema, twm, emptyRowCount); + return twm.releaseTable(); } } @@ -1520,18 +1263,6 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b * @return the data parsed as a table on the GPU. */ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) { - return readJSON(schema, opts, ds, -1); - } - - /** - * Read JSON formatted data. - * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. - * @param opts various JSON parsing options. - * @param ds the DataSource to read from. - * @param emptyRowCount the number of rows to return if no columns were read. - * @return the data parsed as a table on the GPU. - */ - public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emptyRowCount) { long dsHandle = DataSourceHelper.createWrapperDataSource(ds); // only prune the schema if one is provided boolean cudfPruneSchema = schema.getColumnNames() != null && @@ -1554,7 +1285,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int opts.experimental(), opts.getLineDelimiter(), dsHandle))) { - return gatherJSONColumns(schema, twm, emptyRowCount); + return twm.releaseTable(); } finally { DataSourceHelper.destroyWrapperDataSource(dsHandle); } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 0a667978ca3..566ac0b972d 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1037,21 +1037,23 @@ cudf::io::schema_element read_schema_element(int& index, if (d_type.id() == cudf::type_id::STRUCT || d_type.id() == cudf::type_id::LIST) { std::map child_elems; int num_children = children[index]; + std::vector child_names(num_children); // go to the next entry, so recursion can parse it. index++; for (int i = 0; i < num_children; i++) { - auto const name = std::string{names.get(index).get()}; + auto name = std::string{names.get(index).get()}; child_elems.insert( std::pair{name, cudf::jni::read_schema_element(index, children, names, types, scales)}); + child_names[i] = std::move(name); } - return cudf::io::schema_element{d_type, std::move(child_elems)}; + return cudf::io::schema_element{d_type, std::move(child_elems), {std::move(child_names)}}; } else { if (children[index] != 0) { throw std::invalid_argument("found children for a type that should have none"); } // go to the next entry before returning... index++; - return cudf::io::schema_element{d_type, {}}; + return cudf::io::schema_element{d_type, {}, std::nullopt}; } } @@ -1886,13 +1888,18 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, } std::map data_types; + std::vector name_order; int at = 0; while (at < n_types.size()) { auto const name = std::string{n_col_names.get(at).get()}; data_types.insert(std::pair{ name, cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, n_scales)}); + name_order.push_back(name); } - opts.dtypes(data_types); + + cudf::io::schema_element structs{ + cudf::data_type{cudf::type_id::STRUCT}, std::move(data_types), {std::move(name_order)}}; + opts.dtypes(structs); } else { // should infer the types } @@ -2001,13 +2008,18 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, } std::map data_types; + std::vector name_order; + name_order.reserve(n_types.size()); int at = 0; while (at < n_types.size()) { - auto const name = std::string{n_col_names.get(at).get()}; + auto name = std::string{n_col_names.get(at).get()}; data_types.insert(std::pair{ name, cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, n_scales)}); + name_order.emplace_back(std::move(name)); } - opts.dtypes(data_types); + cudf::io::schema_element structs{ + cudf::data_type{cudf::type_id::STRUCT}, std::move(data_types), {std::move(name_order)}}; + opts.dtypes(structs); } else { // should infer the types } From 764a7a25cd22ca0adfa788794d010fb4093a0a81 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 25 Oct 2024 08:56:13 -0700 Subject: [PATCH 2/4] Revert "Auxiliary commit to revert individual files from 4d9a9e0bebe3c76c6eb3df3c96a6eef915790af7" This reverts commit a82fdb699a13008b878deaab18ae85a440cf05af. --- java/src/main/java/ai/rapids/cudf/Table.java | 279 ++++++++++++++++++- 1 file changed, 274 insertions(+), 5 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 6bc3082d1d3..dbee53640aa 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -1092,6 +1092,224 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer) { return readJSON(schema, opts, buffer, 0, buffer.length); } + private static class DidViewChange { + ColumnVector changeWasNeeded = null; + boolean noChangeNeeded = false; + + public static DidViewChange yes(ColumnVector cv) { + DidViewChange ret = new DidViewChange(); + ret.changeWasNeeded = cv; + return ret; + } + + public static DidViewChange no() { + DidViewChange ret = new DidViewChange(); + ret.noChangeNeeded = true; + return ret; + } + } + + private static DidViewChange gatherJSONColumns(Schema schema, TableWithMeta.NestedChildren children, + ColumnView cv) { + // We need to do this recursively to be sure it all matches as expected. + // If we run into problems where the data types don't match, we are not + // going to fix up the data types. We are only going to reorder the columns. + if (schema.getType() == DType.STRUCT) { + if (cv.getType() != DType.STRUCT) { + // The types don't match so just return the input unchanged... + return DidViewChange.no(); + } else { + String[] foundNames; + if (children == null) { + foundNames = new String[0]; + } else { + foundNames = children.getNames(); + } + HashMap indices = new HashMap<>(); + for (int i = 0; i < foundNames.length; i++) { + indices.put(foundNames[i], i); + } + // We might need to rearrange the columns to match what we want. + DType[] types = schema.getChildTypes(); + String[] neededNames = schema.getColumnNames(); + ColumnView[] columns = new ColumnView[neededNames.length]; + try { + boolean somethingChanged = false; + if (columns.length != foundNames.length) { + somethingChanged = true; + } + for (int i = 0; i < columns.length; i++) { + String neededColumnName = neededNames[i]; + Integer index = indices.get(neededColumnName); + Schema childSchema = schema.getChild(i); + if (index != null) { + if (childSchema.isStructOrHasStructDescendant()) { + ColumnView child = cv.getChildColumnView(index); + boolean shouldCloseChild = true; + try { + if (index != i) { + somethingChanged = true; + } + DidViewChange childResult = gatherJSONColumns(schema.getChild(i), + children.getChild(index), child); + if (childResult.noChangeNeeded) { + shouldCloseChild = false; + columns[i] = child; + } else { + somethingChanged = true; + columns[i] = childResult.changeWasNeeded; + } + } finally { + if (shouldCloseChild) { + child.close(); + } + } + } else { + if (index != i) { + somethingChanged = true; + } + columns[i] = cv.getChildColumnView(index); + } + } else { + somethingChanged = true; + if (types[i] == DType.LIST) { + try (Scalar s = Scalar.listFromNull(childSchema.getChild(0).asHostDataType())) { + columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); + } + } else if (types[i] == DType.STRUCT) { + int numStructChildren = childSchema.getNumChildren(); + HostColumnVector.DataType[] structChildren = new HostColumnVector.DataType[numStructChildren]; + for (int structChildIndex = 0; structChildIndex < numStructChildren; structChildIndex++) { + structChildren[structChildIndex] = childSchema.getChild(structChildIndex).asHostDataType(); + } + try (Scalar s = Scalar.structFromNull(structChildren)) { + columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); + } + } else { + try (Scalar s = Scalar.fromNull(types[i])) { + columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); + } + } + } + } + if (somethingChanged) { + try (ColumnView ret = new ColumnView(cv.type, cv.rows, Optional.of(cv.nullCount), + cv.getValid(), null, columns)) { + return DidViewChange.yes(ret.copyToColumnVector()); + } + } else { + return DidViewChange.no(); + } + } finally { + for (ColumnView c: columns) { + if (c != null) { + c.close(); + } + } + } + } + } else if (schema.getType() == DType.LIST && cv.getType() == DType.LIST) { + if (schema.isStructOrHasStructDescendant()) { + String [] childNames = children.getNames(); + if (childNames.length == 2 && + "offsets".equals(childNames[0]) && + "element".equals(childNames[1])) { + try (ColumnView child = cv.getChildColumnView(0)){ + DidViewChange listResult = gatherJSONColumns(schema.getChild(0), + children.getChild(1), child); + if (listResult.noChangeNeeded) { + return DidViewChange.no(); + } else { + try (ColumnView listView = new ColumnView(cv.type, cv.rows, + Optional.of(cv.nullCount), cv.getValid(), cv.getOffsets(), + new ColumnView[]{listResult.changeWasNeeded})) { + return DidViewChange.yes(listView.copyToColumnVector()); + } finally { + listResult.changeWasNeeded.close(); + } + } + } + } + } + // Nothing to change so just return the input, but we need to inc a ref count to really + // make it work, so for now we are going to turn it into a ColumnVector. + return DidViewChange.no(); + } else { + // Nothing to change so just return the input, but we need to inc a ref count to really + // make it work, so for now we are going to turn it into a ColumnVector. + return DidViewChange.no(); + } + } + + private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emptyRowCount) { + String[] neededColumns = schema.getColumnNames(); + if (neededColumns == null || neededColumns.length == 0) { + return twm.releaseTable(); + } else { + String[] foundNames = twm.getColumnNames(); + HashMap indices = new HashMap<>(); + for (int i = 0; i < foundNames.length; i++) { + indices.put(foundNames[i], i); + } + // We might need to rearrange the columns to match what we want. + DType[] types = schema.getChildTypes(); + ColumnVector[] columns = new ColumnVector[neededColumns.length]; + try (Table tbl = twm.releaseTable()) { + int rowCount = tbl == null ? emptyRowCount : (int)tbl.getRowCount(); + if (rowCount < 0) { + throw new IllegalStateException( + "No empty row count provided and the table read has no row count or columns"); + } + for (int i = 0; i < columns.length; i++) { + String neededColumnName = neededColumns[i]; + Integer index = indices.get(neededColumnName); + if (index != null) { + if (schema.getChild(i).isStructOrHasStructDescendant()) { + DidViewChange gathered = gatherJSONColumns(schema.getChild(i), twm.getChild(index), + tbl.getColumn(index)); + if (gathered.noChangeNeeded) { + columns[i] = tbl.getColumn(index).incRefCount(); + } else { + columns[i] = gathered.changeWasNeeded; + } + } else { + columns[i] = tbl.getColumn(index).incRefCount(); + } + } else { + if (types[i] == DType.LIST) { + Schema listSchema = schema.getChild(i); + Schema elementSchema = listSchema.getChild(0); + try (Scalar s = Scalar.listFromNull(elementSchema.asHostDataType())) { + columns[i] = ColumnVector.fromScalar(s, rowCount); + } + } else if (types[i] == DType.STRUCT) { + Schema structSchema = schema.getChild(i); + int numStructChildren = structSchema.getNumChildren(); + DataType[] structChildrenTypes = new DataType[numStructChildren]; + for (int j = 0; j < numStructChildren; j++) { + structChildrenTypes[j] = structSchema.getChild(j).asHostDataType(); + } + try (Scalar s = Scalar.structFromNull(structChildrenTypes)) { + columns[i] = ColumnVector.fromScalar(s, rowCount); + } + } else { + try (Scalar s = Scalar.fromNull(types[i])) { + columns[i] = ColumnVector.fromScalar(s, rowCount); + } + } + } + } + return new Table(columns); + } finally { + for (ColumnVector c: columns) { + if (c != null) { + c.close(); + } + } + } + } + } + /** * Read a JSON file. * @param schema the schema of the file. You may use Schema.INFERRED to infer the schema. @@ -1121,7 +1339,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) { cudfPruneSchema, opts.experimental(), opts.getLineDelimiter()))) { - return twm.releaseTable(); + + return gatherJSONColumns(schema, twm, -1); } } @@ -1137,6 +1356,23 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) { */ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, long len, HostMemoryAllocator hostMemoryAllocator) { + return readJSON(schema, opts, buffer, offset, len, hostMemoryAllocator, -1); + } + + /** + * Read JSON formatted data. + * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. + * @param opts various JSON parsing options. + * @param buffer raw UTF8 formatted bytes. + * @param offset the starting offset into buffer. + * @param len the number of bytes to parse. + * @param hostMemoryAllocator allocator for host memory buffers + * @param emptyRowCount the number of rows to return if no columns were read. + * @return the data parsed as a table on the GPU. + */ + public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, + long len, HostMemoryAllocator hostMemoryAllocator, + int emptyRowCount) { if (len <= 0) { len = buffer.length - offset; } @@ -1145,10 +1381,16 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, lon assert offset >= 0 && offset < buffer.length; try (HostMemoryBuffer newBuf = hostMemoryAllocator.allocate(len)) { newBuf.setBytes(0, buffer, offset, len); - return readJSON(schema, opts, newBuf, 0, len); + return readJSON(schema, opts, newBuf, 0, len, emptyRowCount); } } + public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, + long len, int emptyRowCount) { + return readJSON(schema, opts, buffer, offset, len, DefaultHostMemoryAllocator.get(), + emptyRowCount); + } + public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, long len) { return readJSON(schema, opts, buffer, offset, len, DefaultHostMemoryAllocator.get()); @@ -1222,7 +1464,22 @@ public static TableWithMeta readAndInferJSON(JSONOptions opts, DataSource ds) { * @return the data parsed as a table on the GPU. */ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer buffer, - long offset, long len) { + long offset, long len) { + return readJSON(schema, opts, buffer, offset, len, -1); + } + + /** + * Read JSON formatted data. + * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. + * @param opts various JSON parsing options. + * @param buffer raw UTF8 formatted bytes. + * @param offset the starting offset into buffer. + * @param len the number of bytes to parse. + * @param emptyRowCount the number of rows to use if no columns were found. + * @return the data parsed as a table on the GPU. + */ + public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer buffer, + long offset, long len, int emptyRowCount) { if (len <= 0) { len = buffer.length - offset; } @@ -1251,7 +1508,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b cudfPruneSchema, opts.experimental(), opts.getLineDelimiter()))) { - return twm.releaseTable(); + return gatherJSONColumns(schema, twm, emptyRowCount); } } @@ -1263,6 +1520,18 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b * @return the data parsed as a table on the GPU. */ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) { + return readJSON(schema, opts, ds, -1); + } + + /** + * Read JSON formatted data. + * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. + * @param opts various JSON parsing options. + * @param ds the DataSource to read from. + * @param emptyRowCount the number of rows to return if no columns were read. + * @return the data parsed as a table on the GPU. + */ + public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emptyRowCount) { long dsHandle = DataSourceHelper.createWrapperDataSource(ds); // only prune the schema if one is provided boolean cudfPruneSchema = schema.getColumnNames() != null && @@ -1285,7 +1554,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) { opts.experimental(), opts.getLineDelimiter(), dsHandle))) { - return twm.releaseTable(); + return gatherJSONColumns(schema, twm, emptyRowCount); } finally { DataSourceHelper.destroyWrapperDataSource(dsHandle); } From 6e978cc3057de3a4973262824614aa049033028e Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 25 Oct 2024 10:37:19 -0700 Subject: [PATCH 3/4] Deprecate Java methods Signed-off-by: Nghia Truong --- java/src/main/java/ai/rapids/cudf/Table.java | 245 ++----------------- 1 file changed, 21 insertions(+), 224 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index dbee53640aa..ac531c3c763 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -1092,224 +1092,6 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer) { return readJSON(schema, opts, buffer, 0, buffer.length); } - private static class DidViewChange { - ColumnVector changeWasNeeded = null; - boolean noChangeNeeded = false; - - public static DidViewChange yes(ColumnVector cv) { - DidViewChange ret = new DidViewChange(); - ret.changeWasNeeded = cv; - return ret; - } - - public static DidViewChange no() { - DidViewChange ret = new DidViewChange(); - ret.noChangeNeeded = true; - return ret; - } - } - - private static DidViewChange gatherJSONColumns(Schema schema, TableWithMeta.NestedChildren children, - ColumnView cv) { - // We need to do this recursively to be sure it all matches as expected. - // If we run into problems where the data types don't match, we are not - // going to fix up the data types. We are only going to reorder the columns. - if (schema.getType() == DType.STRUCT) { - if (cv.getType() != DType.STRUCT) { - // The types don't match so just return the input unchanged... - return DidViewChange.no(); - } else { - String[] foundNames; - if (children == null) { - foundNames = new String[0]; - } else { - foundNames = children.getNames(); - } - HashMap indices = new HashMap<>(); - for (int i = 0; i < foundNames.length; i++) { - indices.put(foundNames[i], i); - } - // We might need to rearrange the columns to match what we want. - DType[] types = schema.getChildTypes(); - String[] neededNames = schema.getColumnNames(); - ColumnView[] columns = new ColumnView[neededNames.length]; - try { - boolean somethingChanged = false; - if (columns.length != foundNames.length) { - somethingChanged = true; - } - for (int i = 0; i < columns.length; i++) { - String neededColumnName = neededNames[i]; - Integer index = indices.get(neededColumnName); - Schema childSchema = schema.getChild(i); - if (index != null) { - if (childSchema.isStructOrHasStructDescendant()) { - ColumnView child = cv.getChildColumnView(index); - boolean shouldCloseChild = true; - try { - if (index != i) { - somethingChanged = true; - } - DidViewChange childResult = gatherJSONColumns(schema.getChild(i), - children.getChild(index), child); - if (childResult.noChangeNeeded) { - shouldCloseChild = false; - columns[i] = child; - } else { - somethingChanged = true; - columns[i] = childResult.changeWasNeeded; - } - } finally { - if (shouldCloseChild) { - child.close(); - } - } - } else { - if (index != i) { - somethingChanged = true; - } - columns[i] = cv.getChildColumnView(index); - } - } else { - somethingChanged = true; - if (types[i] == DType.LIST) { - try (Scalar s = Scalar.listFromNull(childSchema.getChild(0).asHostDataType())) { - columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); - } - } else if (types[i] == DType.STRUCT) { - int numStructChildren = childSchema.getNumChildren(); - HostColumnVector.DataType[] structChildren = new HostColumnVector.DataType[numStructChildren]; - for (int structChildIndex = 0; structChildIndex < numStructChildren; structChildIndex++) { - structChildren[structChildIndex] = childSchema.getChild(structChildIndex).asHostDataType(); - } - try (Scalar s = Scalar.structFromNull(structChildren)) { - columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); - } - } else { - try (Scalar s = Scalar.fromNull(types[i])) { - columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount()); - } - } - } - } - if (somethingChanged) { - try (ColumnView ret = new ColumnView(cv.type, cv.rows, Optional.of(cv.nullCount), - cv.getValid(), null, columns)) { - return DidViewChange.yes(ret.copyToColumnVector()); - } - } else { - return DidViewChange.no(); - } - } finally { - for (ColumnView c: columns) { - if (c != null) { - c.close(); - } - } - } - } - } else if (schema.getType() == DType.LIST && cv.getType() == DType.LIST) { - if (schema.isStructOrHasStructDescendant()) { - String [] childNames = children.getNames(); - if (childNames.length == 2 && - "offsets".equals(childNames[0]) && - "element".equals(childNames[1])) { - try (ColumnView child = cv.getChildColumnView(0)){ - DidViewChange listResult = gatherJSONColumns(schema.getChild(0), - children.getChild(1), child); - if (listResult.noChangeNeeded) { - return DidViewChange.no(); - } else { - try (ColumnView listView = new ColumnView(cv.type, cv.rows, - Optional.of(cv.nullCount), cv.getValid(), cv.getOffsets(), - new ColumnView[]{listResult.changeWasNeeded})) { - return DidViewChange.yes(listView.copyToColumnVector()); - } finally { - listResult.changeWasNeeded.close(); - } - } - } - } - } - // Nothing to change so just return the input, but we need to inc a ref count to really - // make it work, so for now we are going to turn it into a ColumnVector. - return DidViewChange.no(); - } else { - // Nothing to change so just return the input, but we need to inc a ref count to really - // make it work, so for now we are going to turn it into a ColumnVector. - return DidViewChange.no(); - } - } - - private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emptyRowCount) { - String[] neededColumns = schema.getColumnNames(); - if (neededColumns == null || neededColumns.length == 0) { - return twm.releaseTable(); - } else { - String[] foundNames = twm.getColumnNames(); - HashMap indices = new HashMap<>(); - for (int i = 0; i < foundNames.length; i++) { - indices.put(foundNames[i], i); - } - // We might need to rearrange the columns to match what we want. - DType[] types = schema.getChildTypes(); - ColumnVector[] columns = new ColumnVector[neededColumns.length]; - try (Table tbl = twm.releaseTable()) { - int rowCount = tbl == null ? emptyRowCount : (int)tbl.getRowCount(); - if (rowCount < 0) { - throw new IllegalStateException( - "No empty row count provided and the table read has no row count or columns"); - } - for (int i = 0; i < columns.length; i++) { - String neededColumnName = neededColumns[i]; - Integer index = indices.get(neededColumnName); - if (index != null) { - if (schema.getChild(i).isStructOrHasStructDescendant()) { - DidViewChange gathered = gatherJSONColumns(schema.getChild(i), twm.getChild(index), - tbl.getColumn(index)); - if (gathered.noChangeNeeded) { - columns[i] = tbl.getColumn(index).incRefCount(); - } else { - columns[i] = gathered.changeWasNeeded; - } - } else { - columns[i] = tbl.getColumn(index).incRefCount(); - } - } else { - if (types[i] == DType.LIST) { - Schema listSchema = schema.getChild(i); - Schema elementSchema = listSchema.getChild(0); - try (Scalar s = Scalar.listFromNull(elementSchema.asHostDataType())) { - columns[i] = ColumnVector.fromScalar(s, rowCount); - } - } else if (types[i] == DType.STRUCT) { - Schema structSchema = schema.getChild(i); - int numStructChildren = structSchema.getNumChildren(); - DataType[] structChildrenTypes = new DataType[numStructChildren]; - for (int j = 0; j < numStructChildren; j++) { - structChildrenTypes[j] = structSchema.getChild(j).asHostDataType(); - } - try (Scalar s = Scalar.structFromNull(structChildrenTypes)) { - columns[i] = ColumnVector.fromScalar(s, rowCount); - } - } else { - try (Scalar s = Scalar.fromNull(types[i])) { - columns[i] = ColumnVector.fromScalar(s, rowCount); - } - } - } - } - return new Table(columns); - } finally { - for (ColumnVector c: columns) { - if (c != null) { - c.close(); - } - } - } - } - } - /** * Read a JSON file. * @param schema the schema of the file. You may use Schema.INFERRED to infer the schema. @@ -1340,7 +1122,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) { opts.experimental(), opts.getLineDelimiter()))) { - return gatherJSONColumns(schema, twm, -1); + return twm.releaseTable(); } } @@ -1361,6 +1143,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, lon /** * Read JSON formatted data. + * + * @deprecated This method is deprecated since emptyRowCount is not used. Use the method without + * emptyRowCount instead. + * * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. * @param opts various JSON parsing options. * @param buffer raw UTF8 formatted bytes. @@ -1370,6 +1156,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, lon * @param emptyRowCount the number of rows to return if no columns were read. * @return the data parsed as a table on the GPU. */ + @SuppressWarnings("unused") public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, long len, HostMemoryAllocator hostMemoryAllocator, int emptyRowCount) { @@ -1381,14 +1168,14 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, lon assert offset >= 0 && offset < buffer.length; try (HostMemoryBuffer newBuf = hostMemoryAllocator.allocate(len)) { newBuf.setBytes(0, buffer, offset, len); - return readJSON(schema, opts, newBuf, 0, len, emptyRowCount); + return readJSON(schema, opts, newBuf, 0, len); } } + @SuppressWarnings("unused") public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, long len, int emptyRowCount) { - return readJSON(schema, opts, buffer, offset, len, DefaultHostMemoryAllocator.get(), - emptyRowCount); + return readJSON(schema, opts, buffer, offset, len, DefaultHostMemoryAllocator.get()); } public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, @@ -1470,6 +1257,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b /** * Read JSON formatted data. + * + * @deprecated This method is deprecated since emptyRowCount is not used. Use the method without + * emptyRowCount instead. + * * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. * @param opts various JSON parsing options. * @param buffer raw UTF8 formatted bytes. @@ -1478,6 +1269,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b * @param emptyRowCount the number of rows to use if no columns were found. * @return the data parsed as a table on the GPU. */ + @SuppressWarnings("unused") public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer buffer, long offset, long len, int emptyRowCount) { if (len <= 0) { @@ -1508,7 +1300,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b cudfPruneSchema, opts.experimental(), opts.getLineDelimiter()))) { - return gatherJSONColumns(schema, twm, emptyRowCount); + return twm.releaseTable(); } } @@ -1525,12 +1317,17 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) { /** * Read JSON formatted data. + * + * @deprecated This method is deprecated since emptyRowCount is not used. Use the method without + * emptyRowCount instead. + * * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. * @param opts various JSON parsing options. * @param ds the DataSource to read from. * @param emptyRowCount the number of rows to return if no columns were read. * @return the data parsed as a table on the GPU. */ + @SuppressWarnings("unused") public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emptyRowCount) { long dsHandle = DataSourceHelper.createWrapperDataSource(ds); // only prune the schema if one is provided @@ -1554,7 +1351,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int opts.experimental(), opts.getLineDelimiter(), dsHandle))) { - return gatherJSONColumns(schema, twm, emptyRowCount); + return twm.releaseTable(); } finally { DataSourceHelper.destroyWrapperDataSource(dsHandle); } From a28a32a5bcd78922cd94de4bc940270fd9353f4f Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 28 Oct 2024 10:53:09 -0700 Subject: [PATCH 4/4] Always prune columns if schema is available Signed-off-by: Nghia Truong --- java/src/main/java/ai/rapids/cudf/Table.java | 17 ----------------- java/src/main/native/src/TableJni.cpp | 14 +++++++------- 2 files changed, 7 insertions(+), 24 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index ac531c3c763..b01ce31b1f3 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -259,7 +259,6 @@ private static native long readJSON(int[] numChildren, String[] columnNames, boolean allowLeadingZeros, boolean allowNonNumericNumbers, boolean allowUnquotedControl, - boolean pruneColumns, boolean experimental, byte lineDelimiter) throws CudfException; @@ -275,7 +274,6 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co boolean allowLeadingZeros, boolean allowNonNumericNumbers, boolean allowUnquotedControl, - boolean pruneColumns, boolean experimental, byte lineDelimiter, long dsHandle) throws CudfException; @@ -1100,10 +1098,6 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer) { * @return the file parsed as a table on the GPU. */ public static Table readJSON(Schema schema, JSONOptions opts, File path) { - // only prune the schema if one is provided - boolean cudfPruneSchema = schema.getColumnNames() != null && - schema.getColumnNames().length != 0 && - opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta( readJSON(schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), @@ -1118,7 +1112,6 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) { opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), - cudfPruneSchema, opts.experimental(), opts.getLineDelimiter()))) { @@ -1278,10 +1271,6 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b assert len > 0; assert len <= buffer.length - offset; assert offset >= 0 && offset < buffer.length; - // only prune the schema if one is provided - boolean cudfPruneSchema = schema.getColumnNames() != null && - schema.getColumnNames().length != 0 && - opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta(readJSON( schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), null, @@ -1297,7 +1286,6 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), - cudfPruneSchema, opts.experimental(), opts.getLineDelimiter()))) { return twm.releaseTable(); @@ -1330,10 +1318,6 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) { @SuppressWarnings("unused") public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emptyRowCount) { long dsHandle = DataSourceHelper.createWrapperDataSource(ds); - // only prune the schema if one is provided - boolean cudfPruneSchema = schema.getColumnNames() != null && - schema.getColumnNames().length != 0 && - opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), opts.isDayFirst(), @@ -1347,7 +1331,6 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), - cudfPruneSchema, opts.experimental(), opts.getLineDelimiter(), dsHandle))) { diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 566ac0b972d..1f8b1ea207d 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1826,7 +1826,6 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, jboolean allow_unquoted_control, - jboolean prune_columns, jboolean experimental, jbyte line_delimiter, jlong ds_handle) @@ -1855,6 +1854,7 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, cudf::io::json_recovery_mode_t recovery_mode = recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : cudf::io::json_recovery_mode_t::FAIL; + cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) @@ -1866,7 +1866,6 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, .delimiter(static_cast(line_delimiter)) .strict_validation(strict_validation) .keep_quotes(keep_quotes) - .prune_columns(prune_columns) .experimental(experimental); if (strict_validation) { opts.numeric_leading_zeros(allow_leading_zeros) @@ -1896,10 +1895,11 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, name, cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, n_scales)}); name_order.push_back(name); } - + auto const prune_columns = data_types.size() != 0; cudf::io::schema_element structs{ cudf::data_type{cudf::type_id::STRUCT}, std::move(data_types), {std::move(name_order)}}; - opts.dtypes(structs); + opts.prune_columns(prune_columns).dtypes(structs); + } else { // should infer the types } @@ -1932,7 +1932,6 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, jboolean allow_unquoted_control, - jboolean prune_columns, jboolean experimental, jbyte line_delimiter) { @@ -1975,6 +1974,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, cudf::io::json_recovery_mode_t recovery_mode = recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : cudf::io::json_recovery_mode_t::FAIL; + cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) @@ -1986,7 +1986,6 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, .delimiter(static_cast(line_delimiter)) .strict_validation(strict_validation) .keep_quotes(keep_quotes) - .prune_columns(prune_columns) .experimental(experimental); if (strict_validation) { opts.numeric_leading_zeros(allow_leading_zeros) @@ -2017,9 +2016,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, name, cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, n_scales)}); name_order.emplace_back(std::move(name)); } + auto const prune_columns = data_types.size() != 0; cudf::io::schema_element structs{ cudf::data_type{cudf::type_id::STRUCT}, std::move(data_types), {std::move(name_order)}}; - opts.dtypes(structs); + opts.prune_columns(prune_columns).dtypes(structs); } else { // should infer the types }