diff --git a/cpp/include/gar/fwd.h b/cpp/include/gar/fwd.h index d830f696b..085fd9c38 100644 --- a/cpp/include/gar/fwd.h +++ b/cpp/include/gar/fwd.h @@ -73,6 +73,9 @@ class DataType; enum FileType { CSV = 0, PARQUET = 1, ORC = 2 }; enum class AdjListType : uint8_t; +template +class Array; + class InfoVersion; class Property; @@ -120,6 +123,7 @@ const std::shared_ptr& int64(); const std::shared_ptr& float32(); const std::shared_ptr& float64(); const std::shared_ptr& string(); +std::shared_ptr list(const std::shared_ptr& value_type); namespace util { struct FilterOptions; diff --git a/cpp/include/gar/graph.h b/cpp/include/gar/graph.h index f8d77317b..e8d90b423 100644 --- a/cpp/include/gar/graph.h +++ b/cpp/include/gar/graph.h @@ -36,7 +36,8 @@ // forward declarations namespace arrow { class ChunkedArray; -} +class Array; +} // namespace arrow namespace GAR_NAMESPACE_INTERNAL { @@ -69,23 +70,12 @@ class Vertex { * @return Result: The property value or error. */ template - inline Result property(const std::string& property) noexcept { - if (properties_.find(property) == properties_.end()) { - return Status::KeyError("Property with name ", property, - " does not exist in the vertex."); - } - try { - T ret = std::any_cast(properties_[property]); - return ret; - } catch (const std::bad_any_cast& e) { - return Status::TypeError("Any cast failed, the property type of ", - property, " is not matched ", e.what()); - } - } + Result property(const std::string& property) const; private: IdType id_; std::map properties_; + std::map> list_properties_; }; /** @@ -124,23 +114,12 @@ class Edge { * @return Result: The property value or error. 
*/ template - inline Result property(const std::string& property) noexcept { - if (properties_.find(property) == properties_.end()) { - return Status::KeyError("Property with name ", property, - " does not exist in the edge."); - } - try { - T ret = std::any_cast(properties_[property]); - return ret; - } catch (const std::bad_any_cast& e) { - return Status::TypeError("Any cast failed, the property type of ", - property, " is not matched ", e.what()); - } - } + Result property(const std::string& property) const; private: IdType src_id_, dst_id_; std::map properties_; + std::map> list_properties_; }; /** diff --git a/cpp/include/gar/util/convert_to_arrow_type.h b/cpp/include/gar/util/convert_to_arrow_type.h index 5ec70385b..ff3b1dd40 100644 --- a/cpp/include/gar/util/convert_to_arrow_type.h +++ b/cpp/include/gar/util/convert_to_arrow_type.h @@ -28,13 +28,25 @@ namespace GAR_NAMESPACE_INTERNAL { /** Struct to convert DataType to arrow::DataType. */ +template +struct CTypeToArrowType {}; + template -struct ConvertToArrowType {}; +struct TypeToArrowType {}; #define CONVERT_TO_ARROW_TYPE(type, c_type, arrow_type, array_type, \ builder_type, type_value, str) \ template <> \ - struct ConvertToArrowType { \ + struct TypeToArrowType { \ + using CType = c_type; \ + using ArrowType = arrow_type; \ + using ArrayType = array_type; \ + using BuilderType = builder_type; \ + static std::shared_ptr TypeValue() { return type_value; } \ + static const char* type_to_string() { return str; } \ + }; \ + template <> \ + struct CTypeToArrowType { \ using CType = c_type; \ using ArrowType = arrow_type; \ using ArrayType = array_type; \ diff --git a/cpp/include/gar/util/data_type.h b/cpp/include/gar/util/data_type.h index 2a70920d4..d7bcf6085 100644 --- a/cpp/include/gar/util/data_type.h +++ b/cpp/include/gar/util/data_type.h @@ -51,6 +51,9 @@ enum class Type { /** UTF8 variable-length string */ STRING, + /** List of some logical data type */ + LIST, + /** User-defined data type */ 
USER_DEFINED, @@ -67,14 +70,21 @@ class DataType { DataType() : id_(Type::BOOL) {} explicit DataType(Type id, const std::string& user_defined_type_name = "") - : id_(id), user_defined_type_name_(user_defined_type_name) {} + : id_(id), + child_(nullptr), + user_defined_type_name_(user_defined_type_name) {} + + explicit DataType(Type id, const std::shared_ptr& child) + : id_(id), child_(std::move(child)), user_defined_type_name_("") {} DataType(const DataType& other) : id_(other.id_), + child_(other.child_), user_defined_type_name_(other.user_defined_type_name_) {} explicit DataType(DataType&& other) : id_(other.id_), + child_(std::move(other.child_)), user_defined_type_name_(std::move(other.user_defined_type_name_)) {} inline DataType& operator=(const DataType& other) = default; @@ -91,6 +101,8 @@ class DataType { return Equals(*other.get()); } + const std::shared_ptr& value_type() const { return child_; } + bool operator==(const DataType& other) const { return Equals(other); } bool operator!=(const DataType& other) const { return !Equals(other); } @@ -98,11 +110,10 @@ class DataType { static std::shared_ptr DataTypeToArrowDataType( const std::shared_ptr& type); - static const std::shared_ptr& ArrowDataTypeToDataType( + static std::shared_ptr ArrowDataTypeToDataType( const std::shared_ptr& type); - static const std::shared_ptr& TypeNameToDataType( - const std::string& str); + static std::shared_ptr TypeNameToDataType(const std::string& str); /** Return the type category of the DataType. 
*/ Type id() const { return id_; } @@ -111,6 +122,7 @@ class DataType { private: Type id_; + std::shared_ptr child_; std::string user_defined_type_name_; }; // struct DataType } // namespace GAR_NAMESPACE_INTERNAL diff --git a/cpp/include/gar/util/util.h b/cpp/include/gar/util/util.h index bfc8cdfe1..e2ea6fdd9 100644 --- a/cpp/include/gar/util/util.h +++ b/cpp/include/gar/util/util.h @@ -126,5 +126,49 @@ struct ValueGetter { } // namespace util +template +class Array final { + public: + using ValueType = T; + Array() : data_(nullptr), size_(0) {} + Array(const T* data, size_t size) : data_(data), size_(size) {} + Array(const Array& other) = default; + Array(Array&& other) = default; + Array& operator=(const Array& other) = default; + Array& operator=(Array&& other) = default; + ~Array() = default; + + const T& operator[](size_t index) const { return data_[index]; } + + const T* data() const { return data_; } + + size_t size() const { return size_; } + + void clear() { + data_ = nullptr; + size_ = 0; + } + + bool empty() const { return size_ == 0; } + + void swap(Array& other) { + std::swap(data_, other.data_); + std::swap(size_, other.size_); + } + + const T* begin() { return data_; } + + const T* end() { return data_ + size_; } + + private: + const T* data_; + size_t size_; +}; + +using Int32Array = Array; +using Int64Array = Array; +using FloatArray = Array; +using DoubleArray = Array; + } // namespace GAR_NAMESPACE_INTERNAL #endif // GAR_UTIL_UTIL_H_ diff --git a/cpp/include/gar/writer/edges_builder.h b/cpp/include/gar/writer/edges_builder.h index f4ff28981..7d251f380 100644 --- a/cpp/include/gar/writer/edges_builder.h +++ b/cpp/include/gar/writer/edges_builder.h @@ -79,6 +79,7 @@ class Edge { * @param name The name of the property. * @param val The value of the property. */ + // TODO(@acezen): Enable the property to be a vector(list). 
inline void AddProperty(const std::string& name, const std::any& val) { empty_ = false; properties_[name] = val; diff --git a/cpp/include/gar/writer/vertices_builder.h b/cpp/include/gar/writer/vertices_builder.h index 1b70ac6b9..e5f4cb21a 100644 --- a/cpp/include/gar/writer/vertices_builder.h +++ b/cpp/include/gar/writer/vertices_builder.h @@ -79,6 +79,7 @@ class Vertex { * @param name The name of the property. * @param val The value of the property. */ + // TODO(@acezen): Enable the property to be a vector(list). inline void AddProperty(const std::string& name, const std::any& val) { empty_ = false; properties_[name] = val; diff --git a/cpp/src/data_type.cc b/cpp/src/data_type.cc index b4c4e24d2..a9df31740 100644 --- a/cpp/src/data_type.cc +++ b/cpp/src/data_type.cc @@ -39,12 +39,14 @@ std::shared_ptr DataType::DataTypeToArrowDataType( return arrow::float64(); case Type::STRING: return arrow::large_utf8(); + case Type::LIST: + return arrow::list(DataTypeToArrowDataType(type->child_)); default: throw std::runtime_error("Unsupported data type"); } } -const std::shared_ptr& DataType::ArrowDataTypeToDataType( +std::shared_ptr DataType::ArrowDataTypeToDataType( const std::shared_ptr& type) { switch (type->id()) { case arrow::Type::BOOL: @@ -61,6 +63,8 @@ const std::shared_ptr& DataType::ArrowDataTypeToDataType( return string(); case arrow::Type::LARGE_STRING: return string(); + case arrow::Type::LIST: + return list(ArrowDataTypeToDataType(type->field(0)->type())); default: throw std::runtime_error("Unsupported data type"); } @@ -85,13 +89,14 @@ std::string DataType::ToTypeName() const { #undef TO_STRING_CASE case Type::USER_DEFINED: return user_defined_type_name_; + case Type::LIST: + return "list<" + child_->ToTypeName() + ">"; default: return "unknown"; } } -const std::shared_ptr& DataType::TypeNameToDataType( - const std::string& str) { +std::shared_ptr DataType::TypeNameToDataType(const std::string& str) { if (str == "bool") { return boolean(); } else if (str == 
"int32") { @@ -104,6 +109,14 @@ const std::shared_ptr& DataType::TypeNameToDataType( return float64(); } else if (str == "string") { return string(); + } else if (str == "list") { + return list(int32()); + } else if (str == "list") { + return list(int64()); + } else if (str == "list") { + return list(float32()); + } else if (str == "list") { + return list(float64()); } else { throw std::runtime_error("Unsupported data type " + str); } @@ -123,4 +136,7 @@ TYPE_FACTORY(float32, Type::FLOAT) TYPE_FACTORY(float64, Type::DOUBLE) TYPE_FACTORY(string, Type::STRING) +std::shared_ptr list(const std::shared_ptr& value_type) { + return std::make_shared(Type::LIST, value_type); +} } // namespace GAR_NAMESPACE_INTERNAL diff --git a/cpp/src/edges_builder.cc b/cpp/src/edges_builder.cc index 9840e53b7..3b6361e54 100644 --- a/cpp/src/edges_builder.cc +++ b/cpp/src/edges_builder.cc @@ -118,37 +118,37 @@ Status EdgesBuilder::validate(const Edge& e, switch (type->id()) { case Type::BOOL: if (property.second.type() != - typeid(typename ConvertToArrowType::CType)) { + typeid(typename TypeToArrowType::CType)) { invalid_type = true; } break; case Type::INT32: if (property.second.type() != - typeid(typename ConvertToArrowType::CType)) { + typeid(typename TypeToArrowType::CType)) { invalid_type = true; } break; case Type::INT64: if (property.second.type() != - typeid(typename ConvertToArrowType::CType)) { + typeid(typename TypeToArrowType::CType)) { invalid_type = true; } break; case Type::FLOAT: if (property.second.type() != - typeid(typename ConvertToArrowType::CType)) { + typeid(typename TypeToArrowType::CType)) { invalid_type = true; } break; case Type::DOUBLE: if (property.second.type() != - typeid(typename ConvertToArrowType::CType)) { + typeid(typename TypeToArrowType::CType)) { invalid_type = true; } break; case Type::STRING: if (property.second.type() != - typeid(typename ConvertToArrowType::CType)) { + typeid(typename TypeToArrowType::CType)) { invalid_type = true; } break; @@ 
-193,9 +193,9 @@ Status EdgesBuilder::tryToAppend( const std::string& property_name, std::shared_ptr& array, // NOLINT const std::vector& edges) { - using CType = typename ConvertToArrowType::CType; + using CType = typename TypeToArrowType::CType; arrow::MemoryPool* pool = arrow::default_memory_pool(); - typename ConvertToArrowType::BuilderType builder(pool); + typename TypeToArrowType::BuilderType builder(pool); for (const auto& e : edges) { if (e.Empty() || (!e.ContainProperty(property_name))) { RETURN_NOT_ARROW_OK(builder.AppendNull()); diff --git a/cpp/src/graph.cc b/cpp/src/graph.cc index fc2cd1071..753ccb74b 100644 --- a/cpp/src/graph.cc +++ b/cpp/src/graph.cc @@ -23,7 +23,7 @@ namespace GAR_NAMESPACE_INTERNAL { template Status CastToAny(std::shared_ptr array, std::any& any) { // NOLINT - using ArrayType = typename ConvertToArrowType::ArrayType; + using ArrayType = typename TypeToArrowType::ArrayType; auto column = std::dynamic_pointer_cast(array); any = column->GetView(0); return Status::OK(); @@ -32,7 +32,7 @@ Status CastToAny(std::shared_ptr array, template <> Status CastToAny(std::shared_ptr array, std::any& any) { // NOLINT - using ArrayType = typename ConvertToArrowType::ArrayType; + using ArrayType = typename TypeToArrowType::ArrayType; auto column = std::dynamic_pointer_cast(array); any = column->GetString(0); return Status::OK(); @@ -69,9 +69,44 @@ Vertex::Vertex(IdType id, auto schema = chunk_table->schema(); for (int i = 0; i < schema->num_fields(); ++i) { auto field = chunk_table->field(i); - auto type = DataType::ArrowDataTypeToDataType(field->type()); - GAR_RAISE_ERROR_NOT_OK(TryToCastToAny( - type, chunk_table->column(i)->chunk(0), properties_[field->name()])); + if (field->type()->id() == arrow::Type::LIST) { + auto list_array = std::dynamic_pointer_cast( + chunk_table->column(i)->chunk(0)); + list_properties_[field->name()] = list_array->value_slice(0); + } else { + auto type = DataType::ArrowDataTypeToDataType(field->type()); + 
GAR_RAISE_ERROR_NOT_OK(TryToCastToAny(type, + chunk_table->column(i)->chunk(0), + properties_[field->name()])); + } + } + } +} + +template +Result Vertex::property(const std::string& property) const { + if constexpr (std::is_final::value) { + auto it = list_properties_.find(property); + if (it == list_properties_.end()) { + return Status::KeyError("The list property ", property, + " doesn't exist."); + } + auto array = std::dynamic_pointer_cast< + typename CTypeToArrowType::ArrayType>( + it->second); + const typename T::ValueType* values = array->raw_values(); + return T(values, array->length()); + } else { + if (properties_.find(property) == properties_.end()) { + return Status::KeyError("Property with name ", property, + " does not exist in the vertex."); + } + try { + T ret = std::any_cast(properties_.at(property)); + return ret; + } catch (const std::bad_any_cast& e) { + return Status::TypeError("Any cast failed, the property type of ", + property, " is not matched ", e.what()); } } } @@ -94,13 +129,67 @@ Edge::Edge( auto schema = chunk_table->schema(); for (int i = 0; i < schema->num_fields(); ++i) { auto field = chunk_table->field(i); - auto type = DataType::ArrowDataTypeToDataType(field->type()); - GAR_RAISE_ERROR_NOT_OK(TryToCastToAny( - type, chunk_table->column(i)->chunk(0), properties_[field->name()])); + if (field->type()->id() == arrow::Type::LIST) { + auto list_array = std::dynamic_pointer_cast( + chunk_table->column(i)->chunk(0)); + list_properties_[field->name()] = list_array->value_slice(0); + } else { + auto type = DataType::ArrowDataTypeToDataType(field->type()); + GAR_RAISE_ERROR_NOT_OK(TryToCastToAny(type, + chunk_table->column(i)->chunk(0), + properties_[field->name()])); + } + } + } +} + +template +Result Edge::property(const std::string& property) const { + if constexpr (std::is_final::value) { + auto it = list_properties_.find(property); + if (it == list_properties_.end()) { + return Status::KeyError("The list property ", property, + " 
doesn't exist."); + } + auto array = std::dynamic_pointer_cast< + typename CTypeToArrowType::ArrayType>( + it->second); + const typename T::ValueType* values = array->raw_values(); + return T(values, array->length()); + } else { + if (properties_.find(property) == properties_.end()) { + return Status::KeyError("Property with name ", property, + " does not exist in the edge."); + } + try { + T ret = std::any_cast(properties_.at(property)); + return ret; + } catch (const std::bad_any_cast& e) { + return Status::TypeError("Any cast failed, the property type of ", + property, " is not matched ", e.what()); } } } +#define INSTANTIATE_PROPERTY(T) \ + template Result Vertex::property(const std::string& name) const; \ + template Result Edge::property(const std::string& name) const; + +INSTANTIATE_PROPERTY(int32_t) +INSTANTIATE_PROPERTY(const int32_t&) +INSTANTIATE_PROPERTY(Int32Array) +INSTANTIATE_PROPERTY(int64_t) +INSTANTIATE_PROPERTY(const int64_t&) +INSTANTIATE_PROPERTY(Int64Array) +INSTANTIATE_PROPERTY(float) +INSTANTIATE_PROPERTY(const float&) +INSTANTIATE_PROPERTY(FloatArray) +INSTANTIATE_PROPERTY(double) +INSTANTIATE_PROPERTY(const double&) +INSTANTIATE_PROPERTY(DoubleArray) +INSTANTIATE_PROPERTY(std::string) +INSTANTIATE_PROPERTY(const std::string&) + IdType EdgeIter::source() { adj_list_reader_.seek(cur_offset_); GAR_ASSIGN_OR_RAISE_ERROR(auto chunk, adj_list_reader_.GetChunk()); diff --git a/cpp/src/graph_info.cc b/cpp/src/graph_info.cc index a77ca1ce0..0ab3cfab0 100644 --- a/cpp/src/graph_info.cc +++ b/cpp/src/graph_info.cc @@ -122,6 +122,10 @@ bool PropertyGroup::IsValidated() const { } else { check_property_unique_set.insert(p.name); } + if (p.type->id() == Type::LIST && file_type_ == FileType::CSV) { + // list type is not supported in csv file + return false; + } } return true; } diff --git a/cpp/src/vertices_builder.cc b/cpp/src/vertices_builder.cc index b290b0ee5..c6eb55bf6 100644 --- a/cpp/src/vertices_builder.cc +++ b/cpp/src/vertices_builder.cc @@ 
-65,37 +65,37 @@ Status VerticesBuilder::validate(const Vertex& v, IdType index, switch (type->id()) { case Type::BOOL: if (property.second.type() != - typeid(typename ConvertToArrowType::CType)) { + typeid(typename TypeToArrowType::CType)) { invalid_type = true; } break; case Type::INT32: if (property.second.type() != - typeid(typename ConvertToArrowType::CType)) { + typeid(typename TypeToArrowType::CType)) { invalid_type = true; } break; case Type::INT64: if (property.second.type() != - typeid(typename ConvertToArrowType::CType)) { + typeid(typename TypeToArrowType::CType)) { invalid_type = true; } break; case Type::FLOAT: if (property.second.type() != - typeid(typename ConvertToArrowType::CType)) { + typeid(typename TypeToArrowType::CType)) { invalid_type = true; } break; case Type::DOUBLE: if (property.second.type() != - typeid(typename ConvertToArrowType::CType)) { + typeid(typename TypeToArrowType::CType)) { invalid_type = true; } break; case Type::STRING: if (property.second.type() != - typeid(typename ConvertToArrowType::CType)) { + typeid(typename TypeToArrowType::CType)) { invalid_type = true; } break; @@ -138,9 +138,9 @@ template Status VerticesBuilder::tryToAppend( const std::string& property_name, std::shared_ptr& array) { // NOLINT - using CType = typename ConvertToArrowType::CType; + using CType = typename TypeToArrowType::CType; arrow::MemoryPool* pool = arrow::default_memory_pool(); - typename ConvertToArrowType::BuilderType builder(pool); + typename TypeToArrowType::BuilderType builder(pool); for (auto& v : vertices_) { if (v.Empty() || !v.ContainProperty(property_name)) { RETURN_NOT_ARROW_OK(builder.AppendNull()); diff --git a/cpp/test/test_graph.cc b/cpp/test/test_graph.cc index 5647ac73f..e179bf6ed 100644 --- a/cpp/test/test_graph.cc +++ b/cpp/test/test_graph.cc @@ -18,6 +18,7 @@ #include "./util.h" #include "gar/graph.h" +#include "gar/util/data_type.h" #define CATCH_CONFIG_MAIN #include @@ -80,6 +81,37 @@ TEST_CASE("Graph") { 
it_begin.property("id").value()); } + SECTION("ListProperty") { + // read file and construct graph info + std::string path = + root + "/ldbc_sample/parquet/ldbc_sample_with_feature.graph.yml"; + auto maybe_graph_info = GraphInfo::Load(path); + REQUIRE(maybe_graph_info.status().ok()); + auto graph_info = maybe_graph_info.value(); + std::string label = "person", list_property = "feature"; + auto maybe_vertices_collection = + VerticesCollection::Make(graph_info, label); + REQUIRE(!maybe_vertices_collection.has_error()); + auto vertices = maybe_vertices_collection.value(); + auto count = 0; + auto vertex_info = graph_info->GetVertexInfo(label); + auto data_type = vertex_info->GetPropertyType(list_property).value(); + REQUIRE(data_type->id() == Type::LIST); + REQUIRE(data_type->value_type()->id() == Type::FLOAT); + if (data_type->id() == Type::LIST && + data_type->value_type()->id() == Type::FLOAT) { + for (auto it = vertices->begin(); it != vertices->end(); ++it) { + auto vertex = *it; + auto float_array = vertex.property(list_property).value(); + for (size_t i = 0; i < float_array.size(); i++) { + REQUIRE(float_array[i] == static_cast(vertex.id()) + i); + } + count++; + } + REQUIRE(count == 903); + } + } + SECTION("EdgesCollection") { std::string src_label = "person", edge_label = "knows", dst_label = "person"; diff --git a/docs/file-format.rst b/docs/file-format.rst index fb3be40d9..f8d384df1 100644 --- a/docs/file-format.rst +++ b/docs/file-format.rst @@ -164,6 +164,7 @@ GraphAr provides a set of built-in data types that are common in real use cases - float - double - string +- list (of int32, int64, float, double; not supported by CSV) .. 
tip:: diff --git a/spark/src/main/scala/com/alibaba/graphar/GraphInfo.scala b/spark/src/main/scala/com/alibaba/graphar/GraphInfo.scala index d2f51db2d..6337a90f9 100644 --- a/spark/src/main/scala/com/alibaba/graphar/GraphInfo.scala +++ b/spark/src/main/scala/com/alibaba/graphar/GraphInfo.scala @@ -44,8 +44,8 @@ object GarType extends Enumeration { /** UTF8 variable-length string */ val STRING = Value(6) - /** Array of same type */ - val ARRAY = Value(7) + /** List of same type */ + val LIST = Value(7) /** * Data type in gar to string. @@ -62,7 +62,7 @@ object GarType extends Enumeration { case GarType.FLOAT => "float" case GarType.DOUBLE => "double" case GarType.STRING => "string" - case GarType.ARRAY => "array" + case GarType.LIST => "list" case _ => throw new IllegalArgumentException("Unknown data type") } @@ -81,7 +81,7 @@ object GarType extends Enumeration { case "float" => GarType.FLOAT case "double" => GarType.DOUBLE case "string" => GarType.STRING - case "array" => GarType.ARRAY + case "list" => GarType.LIST case _ => throw new IllegalArgumentException("Unknown data type: " + str) } } diff --git a/testing b/testing index e8995b5fb..5c848db87 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit e8995b5fb4e4534744beee119acffc1e142b7891 +Subproject commit 5c848db87aabefc783f6481bfd857e216ab5ec89