[GPU] Initial RoPE version for ChatGLM
1 parent 639c155 · commit 3c418e1
Showing 18 changed files with 5,330 additions and 1 deletion.
@@ -0,0 +1,104 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/core/node.hpp"
#include "openvino/op/op.hpp"

namespace ov {
namespace intel_gpu {
namespace op {

/**
 * The operation performs the rotary positional embedding described in:
 * ROFORMER: ENHANCED TRANSFORMER WITH ROTARY POSITION EMBEDDING, by Jianlin Su
 *
 * The core computation applies a 2x2 rotation matrix to a pair of input
 * states x[i0] & x[i1] to produce the rotary-embedded pair of output
 * states y[i0] and y[i1].
 *
 * Suppose the hidden-state dimension (of each attention head) is N, of which d
 * elements are to be embedded (d <= N); the non-embedded part is copied into the output.
 *
 * for i in 0...(d/2)
 *     if (is_interleaved) {
 *         // interleaving style of indexing
 *         i0 = i*2
 *         i1 = i*2 + 1
 *     } else {
 *         // rotate-half style of indexing
 *         i0 = i
 *         i1 = i + (d/2)
 *     }
 *     y[i0] = x[i0]*cos(m * xita[i]) - x[i1]*sin(m * xita[i])
 *     y[i1] = x[i1]*cos(m * xita[i]) + x[i0]*sin(m * xita[i])
 * Note: m is the token position of the current input.
 *
 * Depending on the configuration, additional preprocessing steps may be performed as well:
 *  - slicing the last dimension of the input tensor
 *    (when q/k/v are merged and only the q or k part is to be extracted & embedded)
 *  - transposing the input tensor
 *    (when q/k comes from a fully-connected layer with layout [batch, seq_len, head_cnt, head_dim]
 *     but the RoPE output is required to have layout [batch, head_cnt, seq_length, head_dims])
 *  - gathering sin/cos from inputs 2 & 3 using the position index tensor passed through input 4
 *
 * Inputs:
 *  1. Input hidden states tensor of type T1 - shape:
 *     [batch, seq_length, head_cnt, head_dims] when input_trans0213 == false OR
 *     [batch, head_cnt, seq_length, head_dims] when input_trans0213 == true
 *  2. Pre-calculated cos(m*xita[n]) tensor of type T2 - shape [1, 1, max_position_embeddings, d].
 *  3. Pre-calculated sin(m*xita[n]) tensor of type T2 - shape [1, 1, max_position_embeddings, d].
 *     Input 3 is combined with input 2 when is_interleaved is true.
 *  4. Position index tensor of type T3 - shape [batch, 1, seq_length, 1 or d] OR [batch, seq_length] (optional)
 * Outputs:
 *  1. New embedding tensor of type T1 and of shape [batch, head_cnt, seq_length, head_dims]
 * Types:
 *  T1 - FP32 or BF16
 *  T2 - FP32
 *  T3 - I32
 */
class RoPE : public ov::op::Op {
public:
    OPENVINO_OP("RoPE", "gpu_opset");

    RoPE() = default;

    struct Config {
        size_t slice_start = 0;            // slice the inner-most dimension of the input
        size_t slice_stop = 0;
        bool input_trans0213 = false;      // transpose input dims 1 & 2
        bool is_interleaved = false;       // interleaved mode, implies trans0213 happens after RoPE
        size_t rotary_ndims = 0;           // number of dimensions to be embedded (d in the description)
        bool is_chatglm = false;           // ChatGLM is special and overrides the other settings
        bool is_qwen = false;              // Qwen is special and overrides the other settings
        size_t head_cnt = 0;
        size_t head_size = 0;
        int gather_position_arg_id = 0;    // arg id of the position tensor;
                                           // == 3 when gathering from the sin/cos inputs by position is required
    };

    RoPE(const OutputVector& args, const Config& cfg);

    bool visit_attributes(ov::AttributeVisitor& visitor) override;

    void validate_and_infer_types() override;

    std::shared_ptr<Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;

    const Config& get_config() const {
        return m_config;
    }

    Config& get_config() {
        return m_config;
    }

private:
    Config m_config;
};

}  // namespace op
}  // namespace intel_gpu
}  // namespace ov
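
The doc comment above pins down the per-pair rotation exactly, so a small scalar reference makes the indexing easy to check against the GPU kernel later. The sketch below is not part of the commit; it is a minimal host-side illustration of the rotate-half and interleaved indexing, with made-up names (rope_reference, cos_m, sin_m), and it assumes cos(m * xita[i]) and sin(m * xita[i]) have already been gathered for the current token position m.

#include <cassert>
#include <cstddef>
#include <vector>

// Scalar reference of the rotation described in the RoPE doc comment.
// x, y   : hidden states of one attention head, length N (head_size).
// cos_m  : cos(m * xita[i]) for the current token position m, length d/2.
// sin_m  : sin(m * xita[i]) for the current token position m, length d/2.
// d      : rotary_ndims, d <= N; the tail [d, N) is copied unchanged.
static void rope_reference(const std::vector<float>& x,
                           std::vector<float>& y,
                           const std::vector<float>& cos_m,
                           const std::vector<float>& sin_m,
                           size_t d,
                           bool is_interleaved) {
    const size_t N = x.size();
    assert(d % 2 == 0 && d <= N && y.size() == N);
    for (size_t i = 0; i < d / 2; ++i) {
        size_t i0, i1;
        if (is_interleaved) {   // interleaving style of indexing
            i0 = 2 * i;
            i1 = 2 * i + 1;
        } else {                // rotate-half style of indexing
            i0 = i;
            i1 = i + d / 2;
        }
        y[i0] = x[i0] * cos_m[i] - x[i1] * sin_m[i];
        y[i1] = x[i1] * cos_m[i] + x[i0] * sin_m[i];
    }
    for (size_t i = d; i < N; ++i) {  // non-embedded part is copied into the output
        y[i] = x[i];
    }
}

With the Config above, is_interleaved selects between the two index branches and rotary_ndims plays the role of d; judging from the comment, the is_chatglm / is_qwen overrides concern how the merged q/k/v input and the sin/cos table are sliced and laid out rather than this core rotation.
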
src/plugins/intel_gpu/include/intel_gpu/primitives/rope.hpp (66 additions, 0 deletions)
@@ -0,0 +1,66 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once
#include "primitive.hpp"
#include "intel_gpu/op/rope.hpp"

namespace cldnn {
using RoPE = ov::intel_gpu::op::RoPE;

/// @brief Rotary Position Embedding primitive
struct rope : public primitive_base<rope> {
    CLDNN_DECLARE_PRIMITIVE(rope);

    rope() : primitive_base("", {}) {}

    /// @brief Constructs rope primitive
    /// @param id This primitive id
    /// @param inputs Input primitive ids
    /// @param config RoPE configuration
    rope(const primitive_id& id,
         const std::vector<input_info>& inputs,
         const RoPE::Config& config,
         const padding& output_padding = padding())
        : primitive_base(id, inputs, {output_padding}),
          config(config) {}

    /// @brief RoPE configuration
    RoPE::Config config;

    size_t hash() const override {
        size_t seed = primitive::hash();
        seed = hash_combine(seed, config.gather_position_arg_id);
        seed = hash_combine(seed, config.head_cnt);
        seed = hash_combine(seed, config.head_size);
        seed = hash_combine(seed, config.input_trans0213);
        seed = hash_combine(seed, config.is_chatglm);
        seed = hash_combine(seed, config.is_interleaved);
        seed = hash_combine(seed, config.is_qwen);
        seed = hash_combine(seed, config.rotary_ndims);
        seed = hash_combine(seed, config.slice_start);
        seed = hash_combine(seed, config.slice_stop);
        return seed;
    }

    bool operator==(const primitive& rhs) const override {
        if (!compare_common_params(rhs))
            return false;

        auto rhs_casted = downcast<const rope>(rhs);

        return config.gather_position_arg_id == rhs_casted.config.gather_position_arg_id;  // TODO: compare the remaining config fields
    }

    void save(BinaryOutputBuffer& ob) const override {
        primitive_base<rope>::save(ob);
        ob << config.gather_position_arg_id;  // TODO: serialize the remaining config fields
    }

    void load(BinaryInputBuffer& ib) override {
        primitive_base<rope>::load(ib);
        ib >> config.gather_position_arg_id;  // TODO: deserialize the remaining config fields
    }
};
}  // namespace cldnn
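
The three TODO comments above only cover gather_position_arg_id, while hash() already folds in every Config field. A natural completion is to compare and serialize the full Config in the same way. The following is an editor's sketch rather than code from the commit, and it assumes the BinaryOutputBuffer / BinaryInputBuffer stream operators accept these bool, size_t and int fields directly:

// Possible completion of the TODOs: cover every Config field, mirroring hash().
bool operator==(const primitive& rhs) const override {
    if (!compare_common_params(rhs))
        return false;

    auto rhs_casted = downcast<const rope>(rhs);
    return config.gather_position_arg_id == rhs_casted.config.gather_position_arg_id &&
           config.head_cnt == rhs_casted.config.head_cnt &&
           config.head_size == rhs_casted.config.head_size &&
           config.input_trans0213 == rhs_casted.config.input_trans0213 &&
           config.is_chatglm == rhs_casted.config.is_chatglm &&
           config.is_interleaved == rhs_casted.config.is_interleaved &&
           config.is_qwen == rhs_casted.config.is_qwen &&
           config.rotary_ndims == rhs_casted.config.rotary_ndims &&
           config.slice_start == rhs_casted.config.slice_start &&
           config.slice_stop == rhs_casted.config.slice_stop;
}

void save(BinaryOutputBuffer& ob) const override {
    primitive_base<rope>::save(ob);
    ob << config.slice_start << config.slice_stop << config.input_trans0213
       << config.is_interleaved << config.rotary_ndims << config.is_chatglm
       << config.is_qwen << config.head_cnt << config.head_size
       << config.gather_position_arg_id;
}

void load(BinaryInputBuffer& ib) override {
    primitive_base<rope>::load(ib);
    ib >> config.slice_start >> config.slice_stop >> config.input_trans0213
       >> config.is_interleaved >> config.rotary_ndims >> config.is_chatglm
       >> config.is_qwen >> config.head_cnt >> config.head_size
       >> config.gather_position_arg_id;
}

Keeping the field order identical in save() and load() is what makes the serialization round trip consistent.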