Merge remote-tracking branch 'upstream/master' into paged-attention
ilya-lavrenov committed Feb 12, 2024
2 parents 980fabc + 468701a commit b36e57a
Showing 14 changed files with 117 additions and 102 deletions.
@@ -58,9 +58,9 @@ void reshape_and_cache_cpu_impl(
}
}; // namespace

void reshape_and_cache(ov::Tensor key, ov::Tensor value,
ov::Tensor key_cache, ov::Tensor value_cache,
ov::Tensor slot_mapping) {
void reshape_and_cache_cpu(ov::Tensor key, ov::Tensor value,
ov::Tensor key_cache, ov::Tensor value_cache,
ov::Tensor slot_mapping) {
ov::Shape key_shape = key.get_shape(), key_cache_shape = key_cache.get_shape();
int num_tokens = key_shape[0];
int num_heads = key_shape[1];
@@ -49,8 +49,7 @@ TemplateExtension::PagedAttention::PagedAttention(const ov::OutputVector& inputs
// compile model for prefill stage
std::call_once(m_once, [_this=this] () {
ov::Core core;
core.register_plugin("/mnt/data3_1878/ilya/Documents/Programming/git_repo/openvino/bin/intel64/Release/libopenvino_intel_cpu_plugin.so", "CPU2");
auto compiled_model = core.compile_model(make_prefill_subgraph(), "CPU2");
auto compiled_model = core.compile_model(make_prefill_subgraph(), "CPU");
_this->m_prefill_request = compiled_model.create_infer_request();
});
}
@@ -86,9 +86,12 @@ def greedy_decoder(input) -> Model:
return token_ids.output(0)


def add_greedy_decoding(text_generation_model: Model, logits_output: str = LOGITS_OUTPUT_NAME) -> Model:
def add_greedy_decoding(
text_generation_model: Model, logits_output: str = LOGITS_OUTPUT_NAME, output_type: Type = Type.i64
) -> Model:
ppp = PrePostProcessor(text_generation_model)
ppp.output(logits_output).postprocess().custom(greedy_decoder)
ppp.output(logits_output).tensor().set_element_type(output_type)
model = ppp.build()
model.output(logits_output).tensor.set_names({TOKEN_IDS_OUTPUT_NAME})
return model
@@ -91,26 +91,21 @@ SentencepieceTokenizer::SentencepieceTokenizer(const OutputVector& args, const s
}

void SentencepieceTokenizer::validate_and_infer_types() {

#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS

FRONT_END_GENERAL_CHECK(get_input_size() == 1 + 3, "SentencepieceTokenizer expects 4 inputs: sp model and input sentences represented as 3 decomposed tensors (begins, ends, sybols)");
FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor");
FRONT_END_GENERAL_CHECK(get_input_element_type(1) == element::i32, "SentencepieceTokenizer accepts begins offsets as the second and it should be of type i32 tensor");
FRONT_END_GENERAL_CHECK(get_input_element_type(2) == element::i32, "SentencepieceTokenizer accepts ends offsets as the third and it should be of type i32 tensor");
FRONT_END_GENERAL_CHECK(get_input_element_type(3) == element::u8, "SentencepieceTokenizer accepts sentence symbols as the fourth input and it should be of type u8 tensor");

#else

FRONT_END_GENERAL_CHECK(get_input_size() == 2, "SentencepieceTokenizer expects two inputs: sp model and input sentences");
FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor");

FRONT_END_GENERAL_CHECK(
// WA: sometimes f32 appeared as a placeholder for unknown type
get_input_element_type(1) == element::u8 || get_input_element_type(1) == element::string || get_input_element_type(1) == element::f32,
"SentencepieceTokenizer accepts sentences as the second input and it should be of type string tensor");

#endif
auto input_size = get_input_size();
if(input_size == 2) {
FRONT_END_GENERAL_CHECK(
// WA: f32 appeared as a placeholder for unknown type during intermediate conversion steps
get_input_element_type(1) == element::string || get_input_element_type(1) == element::f32,
"SentencepieceTokenizer accepts sentences as the second input and it should be of type string tensor");
} else if (input_size == 4) {
FRONT_END_GENERAL_CHECK(get_input_element_type(1) == element::i32, "SentencepieceTokenizer accepts begins offsets as the second and it should be of type i32 tensor");
FRONT_END_GENERAL_CHECK(get_input_element_type(2) == element::i32, "SentencepieceTokenizer accepts ends offsets as the third and it should be of type i32 tensor");
FRONT_END_GENERAL_CHECK(get_input_element_type(3) == element::u8, "SentencepieceTokenizer accepts sentence symbols as the fourth input and it should be of type u8 tensor");
} else {
OPENVINO_THROW("Unexpected input format. SentencepieceTokenizer accepts one string input or three decomposed string inputs (begins, ends, symbols)");
};

// The operation SentencepieceTokenizerExtensionOp has three outputs: sparse indices, sparse values
// and dense shape
@@ -133,17 +128,7 @@ bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector&
std::vector<int32_t> sparse_values;
std::vector<int64_t> sparse_dense_shape;

#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS

auto begin_ids = inputs[1].data<const int32_t>();
auto end_ids = inputs[2].data<const int32_t>();
auto data = inputs[3].data<const uint8_t>();

auto batch_size = shape_size(inputs[1].get_shape());

#else

auto input_element_type = get_input_element_type(1);
auto input_size = get_input_size();
int32_t batch_size;

// used in case of string tensors
@@ -154,27 +139,31 @@ bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector&
const int32_t* end_ids;
const uint8_t* data;

if(input_element_type == ov::element::string) {
strings = inputs[1].data<const std::string>();
batch_size = static_cast<int32_t>(ov::shape_size(inputs[1].get_shape()));
} else if(input_element_type == ov::element::u8) {
parse_packed_strings(inputs[1], batch_size, begin_ids, end_ids, data);
if (input_size == 2) {
auto input_element_type = get_input_element_type(1);
if(input_element_type == ov::element::string) {
strings = inputs[1].data<const std::string>();
batch_size = static_cast<int32_t>(ov::shape_size(inputs[1].get_shape()));
} else {
OPENVINO_THROW("Unexpected input type during inference. SentencepieceTokenizer accepts element::u8 or element::string.");
}
} else {
OPENVINO_THROW("Unexpected input type during inference. SentencepieceTokenizer accepts element::u8 or element::string.");
}

#endif
auto begin_ids = inputs[1].data<const int32_t>();
auto end_ids = inputs[2].data<const int32_t>();
auto data = inputs[3].data<const uint8_t>();
batch_size = shape_size(inputs[1].get_shape());
};

size_t max_token_id = 0;
for (size_t batch_ind = 0; batch_ind < batch_size; ++batch_ind) {
absl::string_view sentence;
if(input_element_type == ov::element::string) {
if (input_size == 2) {
sentence = strings[batch_ind];
} else if(input_element_type == ov::element::u8) {
} else {
auto begin_ind = begin_ids[batch_ind];
auto end_ind = end_ids[batch_ind];
sentence = absl::string_view((const char*)data + begin_ind, end_ind - begin_ind);
}
};

std::vector<int32_t> ids;
CHECK_OK(m_sp->SampleEncode(sentence, m_nbest_size, m_alpha, &ids));
@@ -60,7 +60,7 @@ NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) {
auto sp_model_const = as_type_ptr<Constant>(sp_tokenize_op->input_value(0).get_node_shared_ptr());
FRONT_END_GENERAL_CHECK(sp_model_const, "Conversion expects SentencePiece model to be constant.");

// prepare input six inputs
// prepare input
auto inputs = sp_tokenize_op->input_value(1);

// extract values for nbest_size, alpha, add_bos, add_eos, reverse attributes
@@ -70,27 +70,8 @@
auto add_eos = extract_scalar_const_value<bool>(sp_tokenize_op->input_value(5).get_node_shared_ptr(), "add_eos");
auto reverse = extract_scalar_const_value<bool>(sp_tokenize_op->input_value(6).get_node_shared_ptr(), "reverse");

#if !USE_STRING_TENSORS
// Override type of input tensor if this is a Parameter
if (auto parameter = std::dynamic_pointer_cast<Parameter>(inputs.get_node_shared_ptr())) {
parameter->set_partial_shape(PartialShape{ Dimension() });
parameter->set_element_type(element::u8);
parameter->validate_and_infer_types();
}
#endif

#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS

OutputVector inputs_vector = OutputVector{ sp_model_const };
auto unpacked_outputs = std::make_shared<StringTensorUnpack>(OutputVector{inputs}, "begins_ends")->outputs();
inputs_vector.insert(inputs_vector.end(), unpacked_outputs.begin(), unpacked_outputs.end());

#else

OutputVector inputs_vector = OutputVector{ sp_model_const, inputs };

#endif

// create a node with custom operation
auto sp_tokenizer_ext = std::make_shared<SentencepieceTokenizer>(inputs_vector, nbest_size, alpha, add_bos, add_eos, reverse);
FRONT_END_GENERAL_CHECK(sp_tokenizer_ext->get_output_size() == 3,
@@ -182,7 +163,6 @@ ov::OutputVector translate_lookup_table_find_v2(const ov::frontend::NodeContext&

auto wp_tokenizer_inputs = wp_tokenizer->input_values();
wp_tokenizer_inputs.push_back(unk_token_id);
//std::cerr << "Added extra input, total number of inputs is " << wp_tokenizer_inputs.size() << "\n";

auto new_wp_tokenizer = wp_tokenizer->clone_with_new_inputs(wp_tokenizer_inputs);
return { post_translate_ragged_tensor_output(new_wp_tokenizer->outputs()) };
@@ -209,7 +189,6 @@ ov::OutputVector translate_reshape(const ov::frontend::NodeContext& node) {
auto reshape = std::make_shared<Reshape>(tensor, shape, false);
return {reshape};
}
// set_node_name(node.get_name(), reshape); // TODO: requires dependencies from TF FE internals
}

// Copied and pasted from TF FE and adopted to not use internal TF FE operation classes
@@ -232,9 +211,7 @@ ov::OutputVector translate_const(const ov::frontend::NodeContext& node) {
const_node = std::make_shared<ov::op::util::FrameworkNode>(OutputVector{});
}
} else {
//static std::vector<ov::Tensor> tensors;
auto tensor = node.get_attribute<ov::Tensor>("value");
//tensors.push_back(tensor);
const_node = std::make_shared<Constant>(tensor);
#if OPENVINO_ELEMENT_STRING_SUPPORTED
if (const_node->get_element_type() == element::string) {
Expand All @@ -246,6 +223,5 @@ ov::OutputVector translate_const(const ov::frontend::NodeContext& node) {
}
#endif
}
//set_node_name(node.get_name(), const_node); // TODO: Provide alternative to internal function set_node_name
return {const_node};
}
7 changes: 6 additions & 1 deletion modules/nvidia_plugin/src/error.cpp
@@ -11,6 +11,11 @@
namespace ov {
namespace nvidia_gpu {
namespace {
class OVExceptionWrapper : public ov::Exception {
public:
OVExceptionWrapper(const std::string& what) : ov::Exception(what) {}
};

template <typename T>
[[gnu::cold, noreturn]] void throw_exception(const std::string& msg,
const std::experimental::source_location& location) {
@@ -20,7 +25,7 @@

[[gnu::cold, noreturn]] void throw_ov_exception(const std::string& msg,
const std::experimental::source_location& location) {
throw_exception<ov::Exception>(msg, location);
throw_exception<OVExceptionWrapper>(msg, location);
}

[[gnu::cold]] void logError(const std::string& /*msg*/, const std::experimental::source_location& /*location*/) {
17 changes: 0 additions & 17 deletions modules/nvidia_plugin/tests/functional/core_config.cpp
@@ -2,26 +2,9 @@
// SPDX-License-Identifier: Apache-2.0
//

#include "functional_test_utils/core_config.hpp"

#include "cuda_test_constants.hpp"
#include "shared_test_classes/base/ov_subgraph.hpp"

void CoreConfiguration(LayerTestsUtils::LayerTestsCommon* test) {
std::shared_ptr<InferenceEngine::Core> core = PluginCache::get().ie();
ov::element::Type hint = ov::element::f32;
for (auto& param : test->GetFunction()->get_parameters()) {
if (param->get_output_element_type(0) == ov::element::f16) {
hint = ov::element::f16;
break;
}
}
// Set inference_precision hint to run fp32 model in fp32 runtime precision as default plugin execution precision
// may vary
std::map<std::string, std::string> config = {{ov::hint::inference_precision.name(), hint.get_type_name()}};
core->SetConfig(config, ov::test::utils::DEVICE_NVIDIA);
}

namespace ov {
namespace test {

18 changes: 18 additions & 0 deletions modules/openvino_code/README.md
@@ -8,6 +8,7 @@ OpenVINO Code provides the following features:

- Inline Code Completion
- Summarization via Docstring
- Fill in the Middle Mode

## Working with Extension

@@ -48,6 +49,23 @@ You can select the desired type of quotes in the extension settings.
The model can generate a docstring in Code Completion mode, but in that case it is impossible to control the result.
In docstring generation mode, various popular templates are available in the settings to guide the model output.

### Fill in the Middle Mode


1. Create a new Python file or open an existing one.
1. Type `def main():` or place the cursor where you'd like middle text to be generated.
1. Press the keyboard shortcut `Ctrl+Alt+Space` (`Cmd+Alt+Space` for macOS) or click the `Generate Code Completion` button located in the side panel.
1. You can also select text and then generate the related code.
1. You may also right-click on "Generate Inline Code Completion In New Tab" to generate code in a new tab.
1. Use the `Tab` key to accept the entire suggestion or `Ctrl`+`Right Arrow` to accept it word by word. To decline the suggestion, press `Esc`.

You can customize the length of the generated code by adjusting `Max New Tokens` and `Min New Tokens` parameters in the extension settings.
The number of generated tokens is also influenced by the `Server Request Timeout` setting.

Fill in the middle mode provides advanced code completion capabilities, supporting fill-in-the-blank, project-level code completion, and infilling tasks.

To enable fill in the middle mode, check the `Fill In The Middle Mode` checkbox in the extension settings.
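
As a rough illustration of how the extension could assemble the request from the token settings described below, the prompt follows the pattern `{startToken}{text above cursor}{middleToken}{text below cursor}{endToken}`. The sketch below is hypothetical; the `buildFimPrompt` helper and its signature are not the extension's actual code:

```typescript
// Hypothetical sketch: assembling a fill-in-the-middle prompt from the
// startToken / middleToken / endToken settings. Not the extension's actual implementation.
interface FimSettings {
  startToken: string;      // e.g. "<|fim_begin|>"
  middleToken: string;     // e.g. "<|fim_hole|>"
  endToken: string;        // e.g. "<|fim_end|>"
  fillInTheMiddleMode: boolean;
}

function buildFimPrompt(textBeforeCursor: string, textAfterCursor: string, s: FimSettings): string {
  // Text after the cursor is only included when fill-in-the-middle mode is enabled.
  const suffix = s.fillInTheMiddleMode ? textAfterCursor : '';
  return `${s.startToken}${textBeforeCursor}${s.middleToken}${suffix}${s.endToken}`;
}

// Example: ask the server to fill in the body of main()
const prompt = buildFimPrompt(
  'def main():\n    ',
  '\n\nif __name__ == "__main__":\n    main()',
  { startToken: '<|fim_begin|>', middleToken: '<|fim_hole|>', endToken: '<|fim_end|>', fillInTheMiddleMode: true },
);
```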

### Monitoring Extension Output

To examine the input and output from the code generation API, follow these steps:
4 changes: 2 additions & 2 deletions modules/openvino_code/package-lock.json

Some generated files are not rendered by default.

45 changes: 40 additions & 5 deletions modules/openvino_code/package.json
@@ -1,7 +1,7 @@
{
"publisher": "OpenVINO",
"name": "openvino-code-completion",
"version": "0.0.6",
"version": "0.0.8",
"displayName": "OpenVINO Code Completion",
"description": "VSCode extension for AI code completion with OpenVINO",
"icon": "media/logo.png",
@@ -190,11 +190,12 @@
"openvinoCode.model": {
"order": 0,
"type": "string",
"default": "codet5p-220m-py",
"default": "code-t5",
"enum": [
"codet5p-220m-py",
"decicoder-1b-openvino-int8",
"stablecode-completion-3b-int8"
"code-t5",
"decicoder-1b-openvino",
"stablecode-completion",
"deepseek-coder"
],
"description": "Which model to use for code generation."
},
@@ -229,6 +230,40 @@
"default": "false",
"description": "When checked inline complention will be generated in streaming mode"
},
"openvinoCode.fillInTheMiddleMode": {
"order": 4,
"type": "boolean",
"default": "false",
"description":
"When checked, text before (above) and after (below) the cursor will be used for completion generation. When unckecked, only text before (above) the cursor will be used."
},
"openvinoCode.startToken": {
"order": 7,
"type": "string",
"default": "< |fim_begin| >",
"description":
"String that is sent to server is in format: `{startToken}{text above cursor}{middleToken}{text below cursor if fillInTheMiddleMode=true}{endToken}`. Leave `startToken`, `middleToken`, or `endToken` empty if there is no special token for those placements."
},
"openvinoCode.middleToken": {
"order": 8,
"type": "string",
"default": "<|fim▁hole|>",
"description":
"String that is sent to server is in format: `{startToken}{text above cursor}{middleToken}{text below cursor if fillInTheMiddleMode=true}{endToken}`. Leave `startToken`, `middleToken`, or `endToken` empty if there is no special token for those placements."
},
"openvinoCode.endToken": {
"order": 9,
"type": "string",
"default": "<|fim▁end|>",
"description":
"String that is sent to server is in format: `{startToken}{text above cursor}{middleToken}{text below cursor if fillInTheMiddleMode=true}{endToken}`. Leave `startToken`, `middleToken`, or `endToken` empty if there is no special token for those placements."
},
"openvinoCode.stopToken": {
"order": 10,
"type": "string",
"default": "<|endoftext|>",
"description": "(Optional) Stop token."
},
"openvinoCode.temperature": {
"order": 4,
"type": "number",
1 change: 1 addition & 0 deletions modules/openvino_code/shared/features.ts
@@ -1,4 +1,5 @@
export enum Features {
CODE_COMPLETION = 'Code Completion',
SUMMARIZATION = 'Summarization',
FIM = 'Fill-in-the-middle',
}