
Update OpenVINO Code #721

Merged 2 commits on Sep 12, 2023
75 changes: 75 additions & 0 deletions modules/openvino_code/DEVELOPER.md
@@ -0,0 +1,75 @@
# OpenVINO Code - VSCode extension for AI code completion with OpenVINO™

VSCode extension that helps developers write code with an AI code assistant. OpenVINO Code works with a Large Language Model for Code (Code LLM) deployed on a local or remote server.

## Installing Extension

The VSCode extension can be installed from a built `*.vsix` file:

1. Open the `Extensions` side bar in VSCode.
2. Click the menu icon (the three-dots "meatballs" icon) in the top right corner of the Extensions side panel.
3. Select the "Install from VSIX..." option and choose the extension file.

For instructions on how to build the extension `*.vsix` file, please refer to the [Build Extension](#build-extension) section.

## Extension Configuration

To work with the extension, you should configure the endpoint of the server hosting the Code LLM to which requests will be sent:

1. Open extension settings.
2. Fill the `Server URL` parameter with the server endpoint URL.

For instructions on how to start the server locally, please refer to the [server README.md](./server/README.md).

You can also configure special tokens in the extension settings.
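
For reference, requests to the Code LLM server are plain JSON. The snippet below is a minimal sketch of the payload shape the extension builds, not part of the extension itself; it assumes the server from [server](./server) is running locally on port 8000, exposes an `/api/generate` endpoint, and that the `requests` package is installed. Adjust the URL to match your `Server URL` setting.

```
# Minimal sketch of a completion request; mirrors the payload the extension builds.
# Assumptions: the server listens on http://localhost:8000 and exposes /api/generate.
import requests

payload = {
    "inputs": "def main():",
    "parameters": {
        "temperature": 0.2,
        "top_k": 10,
        "top_p": 1,
        "min_new_tokens": 1,
        "max_new_tokens": 100,
    },
}

response = requests.post("http://localhost:8000/api/generate", json=payload, timeout=30)
print(response.json())
```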

## Working with Extension

TBD

1. Create a new python file
2. Try typing `def main():`
3. Press shortcut buttons (TBD) for code completion

### Checking output

You can see input to and output from the code generation API:

1. Open the VSCode `OUTPUT` panel
2. Select the extension output source from the dropdown menu

## Developing

> **Prerequisite:** You should have `Node.js` installed (v16 and above).

#### Install dependencies

To install dependencies, run the following command from the project root directory:

```
npm install
```

#### Run Extension from Source & Debugging

Open `Run and Debug` side bar in VSCode and click `Launch Extension` (or press `F5`).

#### Build Extension

To build the extension and generate a `*.vsix` file for installation in VSCode, run the following command:

```
npm run vsce:package
```

#### Linting

To perform linting with `ESLint`, execute the following command:

```
npm run lint
```

#### Testing

TBD
2 changes: 1 addition & 1 deletion modules/openvino_code/README.md
@@ -13,7 +13,7 @@ OpenVINO Code provides the following features:

1. Create a new python file
2. Try typing `def main():`
3. Press shortcut buttons (TBD) for code completion
3. Press shortcut button `ctrl+alt+space` for code completion

### Checking output

Binary file not shown.
Binary file not shown.
4 changes: 2 additions & 2 deletions modules/openvino_code/package-lock.json

Some generated files are not rendered by default.

32 changes: 22 additions & 10 deletions modules/openvino_code/package.json
@@ -1,7 +1,7 @@
{
"publisher": "OpenVINO",
"name": "openvino-code-completion",
"version": "0.0.2",
"version": "0.0.3",
"displayName": "OpenVINO Code Completion",
"description": "VSCode extension for AI code completion with OpenVINO",
"icon": "media/logo.png",
@@ -188,38 +188,44 @@
"default": 30,
"markdownDescription": "Server request timeout in seconds after which request will be aborted."
},
"openvinoCode.fillInTheMiddleMode": {
"openvinoCode.streamInlineCompletion": {
"order": 3,
"type": "boolean",
"default": "false",
"description": "When checked inline complention will be generated in streaming mode"
},
"openvinoCode.fillInTheMiddleMode": {
"order": 4,
"type": "boolean",
"default": false,
"markdownDescription": "When checked, text before (above) and after (below) the cursor will be used for completion generation. When unckecked, only text before (above) the cursor will be used."
},
"openvinoCode.temperature": {
"order": 4,
"order": 5,
"type": "number",
"default": 0.2,
"description": "Sampling temperature."
"description": "Non-zero value. The higher the value, the more diverse the code suggestions and the lower temperature emphasizes the most likely words."
},
"openvinoCode.topK": {
"order": 4,
"order": 5,
"type": "integer",
"default": 10,
"description": "Top K."
"description": "Select the next word during suggestion generation from the top K candidates. Improves diversity of generated suggestions."
},
"openvinoCode.topP": {
"order": 4,
"order": 5,
"type": "number",
"default": 1,
"description": "Top P."
"description": "A value between 0 and 1. Similar to Top K, it adjusts the number of candidate words based on their probability. Candidates will be added for selection until the cumulative probability exceeds P."
},
"openvinoCode.minNewTokens": {
"order": 5,
"order": 6,
"type": "number",
"default": 1,
"description": "Minimum of new generated tokens."
},
"openvinoCode.maxNewTokens": {
"order": 5,
"order": 6,
"type": "number",
"default": 100,
"description": "Maximum of new generated tokens."
@@ -280,6 +286,12 @@
"key": "ctrl+alt+space",
"mac": "ctrl+alt+space",
"when": "editorTextFocus"
},
{
"command": "openvinoCode.stopGeneration",
"key": "escape",
"mac": "escape",
"when": "openvinoCode.generating"
}
]
},
13 changes: 6 additions & 7 deletions modules/openvino_code/server/pyproject.toml
@@ -4,22 +4,21 @@ version = "0.0.1"
requires-python = ">=3.8"

dependencies = [
'fastapi==0.101.0',
'uvicorn==0.23.1',
'fastapi==0.103.1',
'uvicorn==0.23.2',
'torch @ https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.0.1%2Bcpu.cxx11.abi-cp38-cp38-linux_x86_64.whl ; sys_platform=="linux" and python_version == "3.8"',
'torch @ https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.0.1%2Bcpu.cxx11.abi-cp39-cp39-linux_x86_64.whl ; sys_platform=="linux" and python_version == "3.9"',
'torch @ https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.0.1%2Bcpu.cxx11.abi-cp310-cp310-linux_x86_64.whl ; sys_platform=="linux" and python_version == "3.10"',
'torch @ https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.0.1%2Bcpu.cxx11.abi-cp311-cp311-linux_x86_64.whl ; sys_platform=="linux" and python_version == "3.11"',
'torch ; sys_platform != "linux"',
'openvino==2023.1.0.dev20230811',
'optimum-intel[openvino]==1.11.0',
'transformers==4.31.0',
'optimum==1.12.0',
'optimum-intel[openvino]==1.10.1',
]

[project.optional-dependencies]
dev = [
"black",
"ruff",
]
dev = ["black", "ruff"]

[build-system]
requires = ["setuptools>=43.0.0", "wheel"]
11 changes: 6 additions & 5 deletions modules/openvino_code/server/src/app.py
@@ -1,9 +1,9 @@
from time import perf_counter
from typing import Dict, Union

from fastapi import Depends, FastAPI
from fastapi import Depends, FastAPI, Request
from fastapi.responses import RedirectResponse, StreamingResponse
from pydantic import BaseModel, Field
from pydantic import BaseModel, Field, TypeAdapter

from src.generators import GeneratorFunctor
from src.utils import get_logger
@@ -105,11 +105,12 @@ async def generate(

@app.post("/api/generate_stream", status_code=200)
async def generate_stream(
request: GenerationRequest,
request: Request,
generator: GeneratorFunctor = Depends(get_generator_dummy),
) -> StreamingResponse:
logger.info(request)
return StreamingResponse(generator.generate_stream(request.inputs, request.parameters.model_dump()))
generation_request = TypeAdapter(GenerationRequest).validate_python(await request.json())
logger.info(generation_request)
return StreamingResponse(generator.generate_stream(generation_request.inputs, generation_request.parameters.model_dump(), request))


@app.post("/api/summarize", status_code=200, response_model=GenerationResponse)
40 changes: 35 additions & 5 deletions modules/openvino_code/server/src/generators.py
@@ -1,3 +1,4 @@
import asyncio
import re
from functools import lru_cache
from io import StringIO
@@ -6,6 +7,7 @@
from typing import Any, Callable, Container, Dict, Generator, List, Optional, Type, Union

import torch
from fastapi import Request
from huggingface_hub.utils import EntryNotFoundError
from optimum.intel import OVModelForCausalLM, OVModelForSeq2SeqLM
from transformers import (
@@ -61,11 +63,15 @@ def get_model(checkpoint: str, device: str = "CPU") -> OVModel:
return model


# TODO: generator needs running flag or cancellation on new generation request
# generator cannot handle concurrent requests - fails and stalls process
# RuntimeError: Exception from src/inference/src/infer_request.cpp:189:
# [ REQUEST_BUSY ]
class GeneratorFunctor:
def __call__(self, input_text: str, parameters: Dict[str, Any]) -> str:
raise NotImplementedError

async def generate_stream(self, input_text: str, parameters: Dict[str, Any]):
async def generate_stream(self, input_text: str, parameters: Dict[str, Any], request: Request):
raise NotImplementedError

def summarize(self, input_text: str, template: str, signature: str, style: str, parameters: Dict[str, Any]):
@@ -122,24 +128,45 @@ def __call__(
logger.info(f"Number of input tokens: {prompt_len}; generated {len(output_ids)} tokens")
return self.tokenizer.decode(output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

async def generate_stream(
self, input_text: str, parameters: Dict[str, Any], stopping_criteria: Optional[StoppingCriteriaList] = None
):
async def generate_stream(self, input_text: str, parameters: Dict[str, Any], request: Request = None):
input_ids = self.tokenizer.encode(input_text, return_tensors="pt")
streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)
parameters["streamer"] = streamer
config = GenerationConfig.from_dict({**self.generation_config.to_dict(), **parameters})

stop_on_tokens = StopOnTokens([])

generation_kwargs = dict(
input_ids=input_ids,
streamer=streamer,
stopping_criteria=stopping_criteria,
stopping_criteria=StoppingCriteriaList([stop_on_tokens]),
**config.to_dict(),
)

# listen for the disconnect event so generation can be stopped
def listen_for_disconnect():
async def listen():
message = await request.receive()
if message.get("type") == "http.disconnect":
stop_on_tokens.cancelled = True
asyncio.create_task(listen())


listen_thread = Thread(target=listen_for_disconnect)
# Thread.run() doesn't actually start a new thread;
# it runs the target function in the current thread's context.
# Thread.start() doesn't work here.
listen_thread.run()

thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
thread.start()

for token in streamer:
await asyncio.sleep(0.01)
yield token

thread.join()

def generate_between(
self,
input_parts: List[str],
@@ -243,7 +270,10 @@ def inner() -> GeneratorFunctor:

class StopOnTokens(StoppingCriteria):
def __init__(self, token_ids: List[int]) -> None:
self.cancelled = False
self.token_ids = torch.tensor(token_ids, requires_grad=False)

def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
if self.cancelled:
return True
return torch.any(torch.eq(input_ids[0, -1], self.token_ids)).item()
1 change: 1 addition & 0 deletions modules/openvino_code/src/configuration.ts
@@ -8,6 +8,7 @@ export type CustomConfiguration = {
model: ModelName;
serverUrl: string;
serverRequestTimeout: number;
streamInlineCompletion: boolean;
fillInTheMiddleMode: boolean;
temperature: number;
topK: number;
5 changes: 5 additions & 0 deletions modules/openvino_code/src/constants.ts
@@ -24,4 +24,9 @@ export const COMMANDS = {
STOP_SERVER_NATIVE: 'openvinoCode.stopServerNative',
SHOW_SERVER_LOG: 'openvinoCode.showServerLog',
SHOW_EXTENSION_LOG: 'openvinoCode.showExtensionLog',
STOP_GENERATION: 'openvinoCode.stopGeneration',
};

export const EXTENSION_CONTEXT_STATE = {
GENERATING: 'openvinoCode.generating',
};
@@ -1,7 +1,7 @@
import { InlineCompletionItem, Position, Range, TextDocument, window } from 'vscode';
import { backendService } from '../services/backend.service';
import { extensionState } from '../state';
import { EXTENSION_DISPLAY_NAME } from '../constants';
import { IGenerateRequest, backendService } from '../services/backend.service';
import { extensionState } from '../state';

const outputChannel = window.createOutputChannel(EXTENSION_DISPLAY_NAME, { log: true });
const logCompletionInput = (input: string): void => outputChannel.append(`Completion input:\n${input}\n\n`);
@@ -67,6 +67,41 @@ class CompletionService {
const completionItem = new InlineCompletionItem(generatedText, new Range(position, position.translate(0, 1)));
return [completionItem];
}

async getCompletionStream(
document: TextDocument,
position: Position,
onDataChunk: (chunk: string) => unknown,
signal?: AbortSignal
) {
const textBeforeCursor = this._getTextBeforeCursor(document, position);
const textAfterCursor = this._getTextAfterCursor(document, position);
const completionInput = this._prepareCompletionInput(textBeforeCursor, textAfterCursor);
logCompletionInput(completionInput);

const { temperature, topK, topP, minNewTokens, maxNewTokens } = extensionState.config;

const request: IGenerateRequest = {
inputs: completionInput,
parameters: {
temperature,
top_k: topK,
top_p: topP,
min_new_tokens: minNewTokens,
max_new_tokens: maxNewTokens,
},
};

outputChannel.append(`Completion output:\n`);
return backendService.generateCompletionStream(
request,
(chunk) => {
outputChannel.append(chunk);
onDataChunk(chunk);
},
signal
);
}
}

export default new CompletionService();