Merge remote-tracking branch 'upstream/master' into paged-attention
ilya-lavrenov committed Feb 12, 2024
2 parents 980fabc + 468701a commit b36e57a
Showing 14 changed files with 117 additions and 102 deletions.
@@ -58,9 +58,9 @@ void reshape_and_cache_cpu_impl(
}
}; // namespace

void reshape_and_cache(ov::Tensor key, ov::Tensor value,
ov::Tensor key_cache, ov::Tensor value_cache,
ov::Tensor slot_mapping) {
void reshape_and_cache_cpu(ov::Tensor key, ov::Tensor value,
ov::Tensor key_cache, ov::Tensor value_cache,
ov::Tensor slot_mapping) {
ov::Shape key_shape = key.get_shape(), key_cache_shape = key_cache.get_shape();
int num_tokens = key_shape[0];
int num_heads = key_shape[1];
@@ -49,8 +49,7 @@ TemplateExtension::PagedAttention::PagedAttention(const ov::OutputVector& inputs
// compile model for prefill stage
std::call_once(m_once, [_this=this] () {
ov::Core core;
core.register_plugin("/mnt/data3_1878/ilya/Documents/Programming/git_repo/openvino/bin/intel64/Release/libopenvino_intel_cpu_plugin.so", "CPU2");
auto compiled_model = core.compile_model(make_prefill_subgraph(), "CPU2");
auto compiled_model = core.compile_model(make_prefill_subgraph(), "CPU");
_this->m_prefill_request = compiled_model.create_infer_request();
});
}
@@ -86,9 +86,12 @@ def greedy_decoder(input) -> Model:
return token_ids.output(0)


def add_greedy_decoding(text_generation_model: Model, logits_output: str = LOGITS_OUTPUT_NAME) -> Model:
def add_greedy_decoding(
text_generation_model: Model, logits_output: str = LOGITS_OUTPUT_NAME, output_type: Type = Type.i64
) -> Model:
ppp = PrePostProcessor(text_generation_model)
ppp.output(logits_output).postprocess().custom(greedy_decoder)
ppp.output(logits_output).tensor().set_element_type(output_type)
model = ppp.build()
model.output(logits_output).tensor.set_names({TOKEN_IDS_OUTPUT_NAME})
return model
@@ -91,26 +91,21 @@ SentencepieceTokenizer::SentencepieceTokenizer(const OutputVector& args, const s
}

void SentencepieceTokenizer::validate_and_infer_types() {

#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS

FRONT_END_GENERAL_CHECK(get_input_size() == 1 + 3, "SentencepieceTokenizer expects 4 inputs: sp model and input sentences represented as 3 decomposed tensors (begins, ends, sybols)");
FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor");
FRONT_END_GENERAL_CHECK(get_input_element_type(1) == element::i32, "SentencepieceTokenizer accepts begins offsets as the second and it should be of type i32 tensor");
FRONT_END_GENERAL_CHECK(get_input_element_type(2) == element::i32, "SentencepieceTokenizer accepts ends offsets as the third and it should be of type i32 tensor");
FRONT_END_GENERAL_CHECK(get_input_element_type(3) == element::u8, "SentencepieceTokenizer accepts sentence symbols as the fourth input and it should be of type u8 tensor");

#else

FRONT_END_GENERAL_CHECK(get_input_size() == 2, "SentencepieceTokenizer expects two inputs: sp model and input sentences");
FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor");

FRONT_END_GENERAL_CHECK(
// WA: sometimes f32 appeared as a placeholder for unknown type
get_input_element_type(1) == element::u8 || get_input_element_type(1) == element::string || get_input_element_type(1) == element::f32,
"SentencepieceTokenizer accepts sentences as the second input and it should be of type string tensor");

#endif
auto input_size = get_input_size();
if(input_size == 2) {
FRONT_END_GENERAL_CHECK(
// WA: f32 appeared as a placeholder for unknown type during intermediate conversion steps
get_input_element_type(1) == element::string || get_input_element_type(1) == element::f32,
"SentencepieceTokenizer accepts sentences as the second input and it should be of type string tensor");
} else if (input_size == 4) {
FRONT_END_GENERAL_CHECK(get_input_element_type(1) == element::i32, "SentencepieceTokenizer accepts begins offsets as the second and it should be of type i32 tensor");
FRONT_END_GENERAL_CHECK(get_input_element_type(2) == element::i32, "SentencepieceTokenizer accepts ends offsets as the third and it should be of type i32 tensor");
FRONT_END_GENERAL_CHECK(get_input_element_type(3) == element::u8, "SentencepieceTokenizer accepts sentence symbols as the fourth input and it should be of type u8 tensor");
} else {
OPENVINO_THROW("Unexpected input format. SentencepieceTokenizer accepts one string input or three decomposed string inputs (begins, ends, symbols)");
};

// The operation SentencepieceTokenizerExtensionOp has three outputs: sparse indices, sparse values
// and dense shape
@@ -133,17 +128,7 @@ bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector&
std::vector<int32_t> sparse_values;
std::vector<int64_t> sparse_dense_shape;

#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS

auto begin_ids = inputs[1].data<const int32_t>();
auto end_ids = inputs[2].data<const int32_t>();
auto data = inputs[3].data<const uint8_t>();

auto batch_size = shape_size(inputs[1].get_shape());

#else

auto input_element_type = get_input_element_type(1);
auto input_size = get_input_size();
int32_t batch_size;

// used in case of string tensors
@@ -154,27 +139,31 @@ bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector&
const int32_t* end_ids;
const uint8_t* data;

if(input_element_type == ov::element::string) {
strings = inputs[1].data<const std::string>();
batch_size = static_cast<int32_t>(ov::shape_size(inputs[1].get_shape()));
} else if(input_element_type == ov::element::u8) {
parse_packed_strings(inputs[1], batch_size, begin_ids, end_ids, data);
if (input_size == 2) {
auto input_element_type = get_input_element_type(1);
if(input_element_type == ov::element::string) {
strings = inputs[1].data<const std::string>();
batch_size = static_cast<int32_t>(ov::shape_size(inputs[1].get_shape()));
} else {
OPENVINO_THROW("Unexpected input type during inference. SentencepieceTokenizer accepts element::u8 or element::string.");
}
} else {
OPENVINO_THROW("Unexpected input type during inference. SentencepieceTokenizer accepts element::u8 or element::string.");
}

#endif
auto begin_ids = inputs[1].data<const int32_t>();
auto end_ids = inputs[2].data<const int32_t>();
auto data = inputs[3].data<const uint8_t>();
batch_size = shape_size(inputs[1].get_shape());
};

size_t max_token_id = 0;
for (size_t batch_ind = 0; batch_ind < batch_size; ++batch_ind) {
absl::string_view sentence;
if(input_element_type == ov::element::string) {
if (input_size == 2) {
sentence = strings[batch_ind];
} else if(input_element_type == ov::element::u8) {
} else {
auto begin_ind = begin_ids[batch_ind];
auto end_ind = end_ids[batch_ind];
sentence = absl::string_view((const char*)data + begin_ind, end_ind - begin_ind);
}
};

std::vector<int32_t> ids;
CHECK_OK(m_sp->SampleEncode(sentence, m_nbest_size, m_alpha, &ids));
@@ -60,7 +60,7 @@ NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) {
auto sp_model_const = as_type_ptr<Constant>(sp_tokenize_op->input_value(0).get_node_shared_ptr());
FRONT_END_GENERAL_CHECK(sp_model_const, "Conversion expects SentencePiece model to be constant.");

// prepare input six inputs
// prepare input
auto inputs = sp_tokenize_op->input_value(1);

// extract values for nbest_size, alpha, add_bos, add_eos, reverse attributes
@@ -70,27 +70,8 @@
auto add_eos = extract_scalar_const_value<bool>(sp_tokenize_op->input_value(5).get_node_shared_ptr(), "add_eos");
auto reverse = extract_scalar_const_value<bool>(sp_tokenize_op->input_value(6).get_node_shared_ptr(), "reverse");

#if !USE_STRING_TENSORS
// Override type of input tensor if this is a Parameter
if (auto parameter = std::dynamic_pointer_cast<Parameter>(inputs.get_node_shared_ptr())) {
parameter->set_partial_shape(PartialShape{ Dimension() });
parameter->set_element_type(element::u8);
parameter->validate_and_infer_types();
}
#endif

#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS

OutputVector inputs_vector = OutputVector{ sp_model_const };
auto unpacked_outputs = std::make_shared<StringTensorUnpack>(OutputVector{inputs}, "begins_ends")->outputs();
inputs_vector.insert(inputs_vector.end(), unpacked_outputs.begin(), unpacked_outputs.end());

#else

OutputVector inputs_vector = OutputVector{ sp_model_const, inputs };

#endif

// create a node with custom operation
auto sp_tokenizer_ext = std::make_shared<SentencepieceTokenizer>(inputs_vector, nbest_size, alpha, add_bos, add_eos, reverse);
FRONT_END_GENERAL_CHECK(sp_tokenizer_ext->get_output_size() == 3,
@@ -182,7 +163,6 @@ ov::OutputVector translate_lookup_table_find_v2(const ov::frontend::NodeContext&

auto wp_tokenizer_inputs = wp_tokenizer->input_values();
wp_tokenizer_inputs.push_back(unk_token_id);
//std::cerr << "Added extra input, total number of inputs is " << wp_tokenizer_inputs.size() << "\n";

auto new_wp_tokenizer = wp_tokenizer->clone_with_new_inputs(wp_tokenizer_inputs);
return { post_translate_ragged_tensor_output(new_wp_tokenizer->outputs()) };
@@ -209,7 +189,6 @@ ov::OutputVector translate_reshape(const ov::frontend::NodeContext& node) {
auto reshape = std::make_shared<Reshape>(tensor, shape, false);
return {reshape};
}
// set_node_name(node.get_name(), reshape); // TODO: requires dependencies from TF FE internals
}

// Copied and pasted from TF FE and adopted to not use internal TF FE operation classes
@@ -232,9 +211,7 @@ ov::OutputVector translate_const(const ov::frontend::NodeContext& node) {
const_node = std::make_shared<ov::op::util::FrameworkNode>(OutputVector{});
}
} else {
//static std::vector<ov::Tensor> tensors;
auto tensor = node.get_attribute<ov::Tensor>("value");
//tensors.push_back(tensor);
const_node = std::make_shared<Constant>(tensor);
#if OPENVINO_ELEMENT_STRING_SUPPORTED
if (const_node->get_element_type() == element::string) {
Expand All @@ -246,6 +223,5 @@ ov::OutputVector translate_const(const ov::frontend::NodeContext& node) {
}
#endif
}
//set_node_name(node.get_name(), const_node); // TODO: Provide alternative to internal function set_node_name
return {const_node};
}
7 changes: 6 additions & 1 deletion modules/nvidia_plugin/src/error.cpp
@@ -11,6 +11,11 @@
namespace ov {
namespace nvidia_gpu {
namespace {
class OVExceptionWrapper : public ov::Exception {
public:
OVExceptionWrapper(const std::string& what) : ov::Exception(what) {}
};

template <typename T>
[[gnu::cold, noreturn]] void throw_exception(const std::string& msg,
const std::experimental::source_location& location) {
@@ -20,7 +25,7 @@

[[gnu::cold, noreturn]] void throw_ov_exception(const std::string& msg,
const std::experimental::source_location& location) {
throw_exception<ov::Exception>(msg, location);
throw_exception<OVExceptionWrapper>(msg, location);
}

[[gnu::cold]] void logError(const std::string& /*msg*/, const std::experimental::source_location& /*location*/) {
17 changes: 0 additions & 17 deletions modules/nvidia_plugin/tests/functional/core_config.cpp
@@ -2,26 +2,9 @@
// SPDX-License-Identifier: Apache-2.0
//

#include "functional_test_utils/core_config.hpp"

#include "cuda_test_constants.hpp"
#include "shared_test_classes/base/ov_subgraph.hpp"

void CoreConfiguration(LayerTestsUtils::LayerTestsCommon* test) {
std::shared_ptr<InferenceEngine::Core> core = PluginCache::get().ie();
ov::element::Type hint = ov::element::f32;
for (auto& param : test->GetFunction()->get_parameters()) {
if (param->get_output_element_type(0) == ov::element::f16) {
hint = ov::element::f16;
break;
}
}
// Set inference_precision hint to run fp32 model in fp32 runtime precision as default plugin execution precision
// may vary
std::map<std::string, std::string> config = {{ov::hint::inference_precision.name(), hint.get_type_name()}};
core->SetConfig(config, ov::test::utils::DEVICE_NVIDIA);
}

namespace ov {
namespace test {

18 changes: 18 additions & 0 deletions modules/openvino_code/README.md
@@ -8,6 +8,7 @@ OpenVINO Code provides the following features:

- Inline Code Completion
- Summarization via Docstring
- Fill in the Middle Mode

## Working with Extension

@@ -48,6 +49,23 @@ You can select the desired type of quotes in the extension settings.
The model can generate a docstring in Code Completion mode, but in that case it is impossible to control the result.
In docstring generation mode, various popular templates are available in the settings to guide the model output.

### Fill in the Middle Mode


1. Create a new Python file or open an existing one.
1. Type `def main():` or place the cursor where you'd like middle text to be generated.
1. Press the keyboard shortcut `Ctrl+Alt+Space` (`Cmd+Alt+Space` for macOS) or click the `Generate Code Completion` button located in the side panel.
1. You can also select text and then generate the related code.
1. You may also right-click on "Generate Inline Code Completion In New Tab" to generate code in a new tab.
1. Use the `Tab` key to accept the entire suggestion or `Ctrl`+`Right Arrow` to accept it word by word. To decline the suggestion, press `Esc`.

You can customize the length of the generated code by adjusting `Max New Tokens` and `Min New Tokens` parameters in the extension settings.
The number of generated tokens is also influenced by the `Server Request Timeout` setting.

Fill in the middle mode provides advanced code completion capabilities, supporting fill-in-the-blank, project-level code completion, and infilling tasks.

To enable fill in the middle mode, check the `Fill In The Middle Mode` checkbox in the extension settings.
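
As a rough illustration of how the extension could assemble the request from the token settings described below, the prompt follows the pattern `{startToken}{text above cursor}{middleToken}{text below cursor}{endToken}`. The sketch below is hypothetical; the `buildFimPrompt` helper and its signature are not the extension's actual code:

```typescript
// Hypothetical sketch: assembling a fill-in-the-middle prompt from the
// startToken / middleToken / endToken settings. Not the extension's actual implementation.
interface FimSettings {
  startToken: string;      // e.g. "<|fim_begin|>"
  middleToken: string;     // e.g. "<|fim_hole|>"
  endToken: string;        // e.g. "<|fim_end|>"
  fillInTheMiddleMode: boolean;
}

function buildFimPrompt(textBeforeCursor: string, textAfterCursor: string, s: FimSettings): string {
  // Text after the cursor is only included when fill-in-the-middle mode is enabled.
  const suffix = s.fillInTheMiddleMode ? textAfterCursor : '';
  return `${s.startToken}${textBeforeCursor}${s.middleToken}${suffix}${s.endToken}`;
}

// Example: ask the server to fill in the body of main()
const prompt = buildFimPrompt(
  'def main():\n    ',
  '\n\nif __name__ == "__main__":\n    main()',
  { startToken: '<|fim_begin|>', middleToken: '<|fim_hole|>', endToken: '<|fim_end|>', fillInTheMiddleMode: true },
);
```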

### Monitoring Extension Output

To examine the input and output from the code generation API, follow these steps:
4 changes: 2 additions & 2 deletions modules/openvino_code/package-lock.json

Some generated files are not rendered by default.

45 changes: 40 additions & 5 deletions modules/openvino_code/package.json
@@ -1,7 +1,7 @@
{
"publisher": "OpenVINO",
"name": "openvino-code-completion",
"version": "0.0.6",
"version": "0.0.8",
"displayName": "OpenVINO Code Completion",
"description": "VSCode extension for AI code completion with OpenVINO",
"icon": "media/logo.png",
@@ -190,11 +190,12 @@
"openvinoCode.model": {
"order": 0,
"type": "string",
"default": "codet5p-220m-py",
"default": "code-t5",
"enum": [
"codet5p-220m-py",
"decicoder-1b-openvino-int8",
"stablecode-completion-3b-int8"
"code-t5",
"decicoder-1b-openvino",
"stablecode-completion",
"deepseek-coder"
],
"description": "Which model to use for code generation."
},
@@ -229,6 +230,40 @@
"default": "false",
"description": "When checked inline complention will be generated in streaming mode"
},
"openvinoCode.fillInTheMiddleMode": {
"order": 4,
"type": "boolean",
"default": "false",
"description":
"When checked, text before (above) and after (below) the cursor will be used for completion generation. When unckecked, only text before (above) the cursor will be used."
},
"openvinoCode.startToken": {
"order": 7,
"type": "string",
"default": "< |fim_begin| >",
"description":
"String that is sent to server is in format: `{startToken}{text above cursor}{middleToken}{text below cursor if fillInTheMiddleMode=true}{endToken}`. Leave `startToken`, `middleToken`, or `endToken` empty if there is no special token for those placements."
},
"openvinoCode.middleToken": {
"order": 8,
"type": "string",
"default": "<|fim▁hole|>",
"description":
"String that is sent to server is in format: `{startToken}{text above cursor}{middleToken}{text below cursor if fillInTheMiddleMode=true}{endToken}`. Leave `startToken`, `middleToken`, or `endToken` empty if there is no special token for those placements."
},
"openvinoCode.endToken": {
"order": 9,
"type": "string",
"default": "<|fim▁end|>",
"description":
"String that is sent to server is in format: `{startToken}{text above cursor}{middleToken}{text below cursor if fillInTheMiddleMode=true}{endToken}`. Leave `startToken`, `middleToken`, or `endToken` empty if there is no special token for those placements."
},
"openvinoCode.stopToken": {
"order": 10,
"type": "string",
"default": "<|endoftext|>",
"description": "(Optional) Stop token."
},
"openvinoCode.temperature": {
"order": 4,
"type": "number",
1 change: 1 addition & 0 deletions modules/openvino_code/shared/features.ts
@@ -1,4 +1,5 @@
export enum Features {
CODE_COMPLETION = 'Code Completion',
SUMMARIZATION = 'Summarization',
FIM = 'Fill-in-the-middle',
}