.github/workflows/causal_lm_cpp.yml

name: causal_lm_cpp
on:
  pull_request:
    paths:
      - .github/workflows/causal_lm_cpp.yml
      - src/**
      - samples/**
      - thirdparty/openvino_tokenizers
      - "!**.md"
permissions: read-all # Required by https://github.com/ossf/scorecard/blob/e23b8ad91fd6a64a0a971ca4fc0a4d1650725615/docs/checks.md#token-permissions
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

env:
  l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/l_openvino_toolkit_ubuntu20_2024.5.0.dev20240830_x86_64.tgz
  m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/m_openvino_toolkit_macos_12_6_2024.5.0.dev20240830_x86_64.tgz
  w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16570-19eb02fe60b/w_openvino_toolkit_windows_2024.5.0.dev20240830_x86_64.zip
jobs:
  cpp-multinomial-greedy_causal_lm-ubuntu:
    runs-on: ubuntu-20.04-8-cores
    defaults:
      run:
        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
      - name: Download and convert and model
        run: |
          source ./ov/setupvars.sh
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model openlm-research/open_llama_3b_v2 open_llama_3b_v2
      - run: >
          . ./ov/setupvars.sh
          && PYTHONPATH=./build/:$PYTHONPATH timeout 25s
          ./build/samples/cpp/multinomial_causal_lm/multinomial_causal_lm ./open_llama_3b_v2/ a
      - run: >
          . ./ov/setupvars.sh
          && PYTHONPATH=./build/:$PYTHONPATH timeout 25s
          ./samples/python/multinomial_causal_lm/multinomial_causal_lm.py ./open_llama_3b_v2/ b
      - run: >
          . ./ov/setupvars.sh
          && export PYTHONPATH=./build/:$PYTHONPATH
          && timeout 25s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./open_llama_3b_v2/ "return 0"
          | diff <(timeout 25s samples/python/greedy_causal_lm/greedy_causal_lm.py ./open_llama_3b_v2/ "return 0") -

  cpp-beam_search_causal_lm-ubuntu:
    strategy:
      matrix:
        executable:
          [
            ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm,
            python ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py,
          ]
    runs-on: ubuntu-20.04
    if: ${{ false }} # fails because of UNICODE output
    defaults:
      run:
        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
      - name: Download and convert and model
        run: |
          source ./ov/setupvars.sh
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
      - name: Compare
        run: |
          source ./ov/setupvars.sh
          export PYTHONPATH=./build/:$PYTHONPATH  # C++ ignores that

          timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" > ./pred.txt
          python -c "
          import transformers
          with open('pred.txt', 'r') as file:
              predictions = file.read()
          tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
          tokenized = tokenizer('Why is the Sun yellow?', return_tensors='pt')
          for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
              idx = predictions.find(ref)
              if -1 == idx:
                  raise RuntimeError(f'Missing "{ref=}" from predictions')
              predictions = predictions[:idx] + predictions[idx + len(ref):]
          "
          echo "Why is the Sun yellow?" passed

          timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ 69 > ./pred.txt
          python -c "
          import transformers
          with open('pred.txt', 'r') as file:
              predictions = file.read()
          tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
          tokenized = tokenizer('69', return_tensors='pt')
          for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
              idx = predictions.find(ref)
              if -1 == idx:
                  raise RuntimeError(f'Missing "{ref=}" from predictions')
              predictions = predictions[:idx] + predictions[idx + len(ref):]
          "
          echo 69 passed

          timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ Hi > ./pred.txt
          python -c "
          import transformers
          with open('pred.txt', 'r') as file:
              predictions = file.read()
          tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
          tokenized = tokenizer('Hi', return_tensors='pt')
          for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
              idx = predictions.find(ref)
              if -1 == idx:
                  raise RuntimeError(f'Missing "{ref=}" from predictions')
              predictions = predictions[:idx] + predictions[idx + len(ref):]
          "
          echo "Hi" passed

          timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "return 0" > ./pred.txt
          python -c "
          import transformers
          with open('pred.txt', 'r') as file:
              predictions = file.read()
          tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
          tokenized = tokenizer('return 0', return_tensors='pt')
          for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
              idx = predictions.find(ref)
              if -1 == idx:
                  raise RuntimeError(f'Missing "{ref=}" from predictions')
              predictions = predictions[:idx] + predictions[idx + len(ref):]
          "
          echo "return 0" passed

          timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "你好！ 你好嗎？" > ./pred.txt
          python -c "
          import transformers
          with open('pred.txt', 'r') as file:
              predictions = file.read()
          tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
          tokenized = tokenizer('你好！ 你好嗎？', return_tensors='pt')
          for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
              idx = predictions.find(ref)
              if -1 == idx:
                  raise RuntimeError(f'Missing "{ref=}" from predictions')
              predictions = predictions[:idx] + predictions[idx + len(ref):]
          "
          echo "你好！ 你好嗎？" passed

          timeout 1m ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Alan Turing was a" "return 0" "你好！ 你好嗎？" > ./pred.txt
          python -c "
          import transformers
          with open('pred.txt', 'r') as file:
              predictions = file.read()
          tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
          prompts = [
            'Alan Turing was a',
            'return 0',
            '你好！ 你好嗎？'
          ]
          for prompt in prompts:
            tokenized = tokenizer(prompt, return_tensors='pt')
            for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
                ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
                idx = predictions.find(ref)
                if -1 == idx:
                    raise RuntimeError(f'Missing "{ref=}" from predictions')
                predictions = predictions[:idx] + predictions[idx + len(ref):]
          "
          echo "Multi prompt" passed

  cpp-greedy_causal_lm-windows:
    runs-on: windows-latest
    if: ${{ false }} # TODO: fix Windows
    env:
      PYTHONIOENCODING: "utf8"
    defaults:
      run:
        shell: cmd
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Configure Developer Command Prompt for Microsoft Visual C++
        uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0
      - run: curl --output ov.zip ${{ env.w_ov_link }}
      - run: unzip -d ov ov.zip
      - run: dirs=(ov/*) && mv ov/*/* ov && rmdir "${dirs[@]}"
        shell: bash
      - name: Build app
        run: |
          call .\ov\setupvars.bat
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
      - name: Download and convert model
        run: |
          call .\ov\setupvars.bat
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
      - run: >
          set PATH=.\build\openvino_genai\;%PATH%
          && call .\ov\setupvars.bat
          && .\build\samples\cpp\greedy_causal_lm\Release\greedy_causal_lm.exe .\TinyLlama-1.1B-Chat-v1.0\ 69 > .\cpp.txt
      - run: |
          echo import transformers > ref.py
          echo predictions = open('cpp.txt', 'r').read() >> ref.py
          echo tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', trust_remote_code=True) >> ref.py
          echo tokenized = tokenizer('69', return_tensors='pt') >> ref.py
          echo for beam in transformers.AutoModelForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', trust_remote_code=True).generate(**tokenized, max_new_tokens=100, do_sample=False): >> ref.py
          echo     ref = tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) >> ref.py
          echo     idx = predictions.find(ref) >> ref.py
          echo     if -1 == idx: >> ref.py
          echo         raise RuntimeError(f'Missing "{ref=}" from predictions') >> ref.py
          echo     predictions = predictions[:idx] + predictions[idx + len(ref):] >> ref.py
      - run: python ref.py
      - run: >
          set PATH=.\build\openvino_genai\;%PATH%
          && set "PYTHONPATH=./build/"
          && call .\ov\setupvars.bat
          && python samples\python\greedy_causal_lm\greedy_causal_lm.py .\TinyLlama-1.1B-Chat-v1.0\ 69 > .\py.txt
      - run: fc .\cpp.txt .\py.txt

  cpp-beam_search_causal_lm-Qwen-7B-Chat:
    runs-on: ubuntu-20.04-16-cores
    defaults:
      run:
        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
      - name: Download and convert and model
        run: |
          source ./ov/setupvars.sh
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat
      - run: >
          . ./ov/setupvars.sh
          && export PYTHONPATH=./build/:$PYTHONPATH
          && timeout 2m ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./Qwen-7B-Chat/ 69 | diff <(timeout 2m samples/python/greedy_causal_lm/greedy_causal_lm.py ./Qwen-7B-Chat/ 69) -

  cpp-beam_search_causal_lm-Qwen1_5-7B-Chat:
    runs-on: ubuntu-20.04-16-cores
    defaults:
      run:
        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
      - name: Download and convert and model
        run: |
          source ./ov/setupvars.sh
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen1.5-7B-Chat Qwen1.5-7B-Chat
      - run: >
          . ./ov/setupvars.sh
          && export PYTHONPATH=./build/:$PYTHONPATH
          && timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好！"
          | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./Qwen1.5-7B-Chat/ "你好！") -

  cpp-beam_search_causal_lm-Phi-2:
    runs-on: ubuntu-20.04-16-cores
    defaults:
      run:
        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
      - name: Download and convert and model
        run: |
          source ./ov/setupvars.sh
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-2 phi-2
      - run: >
          . ./ov/setupvars.sh
          && export PYTHONPATH=./build/:$PYTHONPATH
          && timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./phi-2/ 69
          | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./phi-2/ 69) -

  cpp-beam_search_causal_lm-notus-7b-v1:
    runs-on: ubuntu-20.04-16-cores
    defaults:
      run:
        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
      - name: Download and convert and model
        run: |
          source ./ov/setupvars.sh
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model argilla/notus-7b-v1 notus-7b-v1
      - run: >
          . ./ov/setupvars.sh
          && export PYTHONPATH=./build/:$PYTHONPATH
          && timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./notus-7b-v1/ 69
          | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./notus-7b-v1/ 69) -

  cpp-speculative_decoding_lm-ubuntu:
    runs-on: ubuntu-20.04-16-cores
    defaults:
      run:
        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
      - name: Download and convert and model
        run: |
          source ./ov/setupvars.sh
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b
      - name: run and compare
        run: |
          source ./ov/setupvars.sh
          ./build/samples/cpp/speculative_decoding_lm/speculative_decoding_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_speculative.txt
          ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt
          python -c "
          with open('predictions_greedy.txt', 'r') as f:
              predicted_greedy = f.readline()
          with open('predictions_speculative.txt', 'r') as f:
              predicted_speculative = f.readline()
          assert predicted_greedy == predicted_speculative
          "
          echo "Alan Turing was a" passed

  cpp-prompt_lookup_decoding_lm-ubuntu:
    runs-on: ubuntu-20.04-16-cores
    defaults:
      run:
        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
      - name: Download and convert and model
        run: |
          source ./ov/setupvars.sh
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat --task text-generation-with-past
      - name: run and compare
        run: |
          source ./ov/setupvars.sh

          echo 'Code:```python
          def add(a, b):
              return a + b
          ```
          Question: Can you please add 2 and 3
          A:' > ./prompt.txt

          ./build/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$(<prompt.txt)" > predictions_prompt_lookup.txt
          ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$(<prompt.txt)" > predictions_greedy.txt
          python -c "
          with open('predictions_greedy.txt', 'r') as f:
              predicted_greedy = f.readline()
          with open('predictions_prompt_lookup.txt', 'r') as f:
              predicted_prompt_lookup = f.readline()
          assert predicted_greedy == predicted_prompt_lookup
          "
          echo "Prompt lookup" passed
      - name: run and compare (model with seq_length_axis = 1)
        run: |
          source ./ov/setupvars.sh

          echo 'Code:```python
          def add(a, b):
              return a + b
          ```
          Question: Can you please add 2 and 3
          A:' > ./prompt.txt

          ./build/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm ./Qwen-7B-Chat/ "$(<prompt.txt)" > predictions_prompt_lookup.txt
          ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./Qwen-7B-Chat/ "$(<prompt.txt)" > predictions_greedy.txt
          
          python -c "
          with open('predictions_greedy.txt', 'r') as f:
              predicted_greedy = f.readline()
          with open('predictions_prompt_lookup.txt', 'r') as f:
              predicted_prompt_lookup = f.readline()
          assert predicted_greedy == predicted_prompt_lookup
          "
          echo "Prompt lookup" passed

  cpp-Phi-1_5:
    runs-on: ubuntu-20.04-16-cores
    defaults:
      run:
        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
      - name: Download and convert and model
        run: |
          source ./ov/setupvars.sh
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-1_5 phi-1_5
      - name: Run Generation
        run: |
          source ./ov/setupvars.sh
          timeout 50s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_greedy.txt
      - name: Compare
        run: |
          python -c "
          import transformers
          with open('pred_greedy.txt', 'r') as file:
              predictions = file.read()
          tokenizer = transformers.AutoTokenizer.from_pretrained('microsoft/phi-1_5')
          tokenized = tokenizer('Alan Turing was a', return_tensors='pt')
          for output in transformers.AutoModelForCausalLM.from_pretrained('microsoft/phi-1_5').generate(**tokenized, max_length=100, do_sample=False):
              ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True)
              idx = predictions.find(ref)
              if -1 == idx:
                  raise RuntimeError(f'Missing "{ref=}" from predictions')
              predictions = predictions[:idx] + predictions[idx + len(ref):]
          "
          echo Phi-1_5 passed
      - run: >
          . ./ov/setupvars.sh
          && export PYTHONPATH=./build/:$PYTHONPATH
          && timeout 50s samples/python/greedy_causal_lm/greedy_causal_lm.py ./phi-1_5/ "Alan Turing was a"
          | diff ./pred_greedy.txt -

  cpp-greedy_causal_lm-redpajama-3b-chat:
    runs-on: ubuntu-20.04-4-cores
    defaults:
      run:
        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
      - name: Download and convert and model
        run: |
          source ./ov/setupvars.sh
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model ikala/redpajama-3b-chat redpajama-3b-chat
      - name: Run Generation
        run: |
          source ./ov/setupvars.sh
          timeout 50s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt
      - name: Compare
        run: |
          python -c "
          import transformers
          with open('pred_greedy.txt', 'r') as file:
              predictions = file.read()
          tokenizer = transformers.AutoTokenizer.from_pretrained('ikala/redpajama-3b-chat')
          tokenized = tokenizer('Alan Turing was a', return_tensors='pt')
          for output in transformers.AutoModelForCausalLM.from_pretrained('ikala/redpajama-3b-chat').generate(**tokenized, max_length=100, do_sample=False):
              ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True)
              idx = predictions.find(ref)
              if -1 == idx:
                  raise RuntimeError(f'Missing "{ref}" from predictions')
              predictions = predictions[:idx] + predictions[idx + len(ref):]
          "
          echo "Alan Turing was a" passed
      - run: >
          . ./ov/setupvars.sh
          && export PYTHONPATH=./build/:$PYTHONPATH
          && timeout 50s samples/python/greedy_causal_lm/greedy_causal_lm.py ./redpajama-3b-chat/ "Alan Turing was a"
          | diff ./pred_greedy.txt -

  cpp-chat_sample-ubuntu:
    runs-on: ubuntu-20.04
    defaults:
      run:
        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
      - name: Download and convert and model
        run: |
          source ./ov/setupvars.sh
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
      - name: Compare
        run: |
          source ./ov/setupvars.sh
          printf 'What is 2 + 2?\nWhat is the previous answer?\nAdd 1 to it.\nSubtract 5 from it.\nWhy is the sun yellow?\nWhat was my first question?\n' > ./input.txt
          timeout 30s ./build/samples/cpp/chat_sample/chat_sample ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred.txt
          python -c "
          from transformers import LlamaTokenizer, AutoModelForCausalLM
          model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
          tokenizer = LlamaTokenizer.from_pretrained(model_id)
          model = AutoModelForCausalLM.from_pretrained(model_id)
          prompts = ['What is 2 + 2?', 'What is the previous answer?', 'Add 1 to it.', 'Subtract 5 from it.', 'Why is the sun yellow?', 'What was my first question?']
          def gen_prompt(prompt):
              return {'role': 'user', 'content': prompt}
          def gen_answer(answer):
              return {'role': 'assistant', 'content': answer}
          chat_history = []
          chat_prompt = ''
          output = open('ref.txt', 'w')
          for prompt in prompts:
              output.write('question:\n')
              chat_history.append(gen_prompt(prompt))
              chat_prompt = tokenizer.apply_chat_template(chat_history, tokenize=False, add_generation_prompt=True)
              tokenized = tokenizer(chat_prompt, return_tensors='pt')
              answer = model.generate(**tokenized, max_length=1000, do_sample=False)
              answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True)
              chat_history.append(gen_answer(answer_str))
              output.write(answer_str)
              output.write('\n----------\n')
          output.write('question:\n')
          output.close()
          "
          diff pred.txt ref.txt
          echo "Chat sample cpp" passed
          export PYTHONPATH=./build/:$PYTHONPATH
          timeout 30s ./samples/python/chat_sample/chat_sample.py ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred2.txt
          diff pred2.txt ref.txt
          echo "Chat sample python" passed

  cpp-continuous-batching-ubuntu:
    runs-on: ubuntu-20.04-8-cores
    defaults:
      run:
        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
      - name: Download and convert and model
        run: |
          source ./ov/setupvars.sh
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
      - name: Run gtests
        run: |
          source ./ov/setupvars.sh
          ./build/tests/cpp/tests_continuous_batching
      - name: Run accuracy_sample
        run: |
          source ./ov/setupvars.sh
          timeout 50s ./build/samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy -m ./TinyLlama-1.1B-Chat-v1.0/ -n 5
      - name: Run throughput_benchmark
        run: |
          wget -q https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
          source ./ov/setupvars.sh
          timeout 200s ./build/samples/cpp/continuous_batching_benchmark/continuous_batching_benchmark -n 10 -m ./TinyLlama-1.1B-Chat-v1.0/ --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1
          timeout 200s ./build/samples/cpp/continuous_batching_benchmark/continuous_batching_benchmark -n 10 --dynamic_split_fuse --max_batch_size 256 --max_input_len 256 -m ./TinyLlama-1.1B-Chat-v1.0/ --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1

  cpp-continuous-batching-windows:
    runs-on: windows-latest
    if: ${{ false }} # TODO: fix Windows
    env:
      PYTHONIOENCODING: "utf8"
    defaults:
      run:
        shell: cmd
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Configure Developer Command Prompt for Microsoft Visual C++
        uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0
      - name: Install OpenVINO
        run: |
          curl --output ov.zip ${{ env.w_ov_link }}
          unzip -d ov ov.zip
          dirs=(ov/*) && mv ov/*/* ov && rmdir "${dirs[@]}"
        shell: bash
      - name: Build app
        run: |
          call .\ov\setupvars.bat
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
      - name: Download and convert and model
        run: |
          call .\ov\setupvars.bat
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
      - name: Run gtests
        run: |
          set PATH=.\build\openvino_genai\;%PATH%
          call .\ov\setupvars.bat
          .\build\tests\cpp\Release\tests_continuous_batching.exe
      - name: Run accuracy_sample
        run: |
          set PATH=.\build\openvino_genai\;%PATH%
          call .\ov\setupvars.bat
          .\build\samples\cpp\continuous_batching_accuracy\Release\continuous_batching_accuracy.exe -m .\TinyLlama-1.1B-Chat-v1.0\ -n 5
      - name: Run throughput_benchmark
        run: |
          curl -o .\ShareGPT_V3_unfiltered_cleaned_split.json -s -L "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
          set PATH=.\build\openvino_genai\;%PATH%
          call .\ov\setupvars.bat
          .\build\samples\cpp\continuous_batching_benchmark\Release\continuous_batching_benchmark.exe -n 2 -m .\TinyLlama-1.1B-Chat-v1.0\ --dataset .\ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1

  cpp-continuous-batching-macos:
    runs-on: macos-12
    defaults:
      run:
        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.m_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          brew install coreutils scons
      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
      - name: Download and convert and model
        run: |
          source ./ov/setupvars.sh
          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
      - name: Run gtests
        run: |
          source ./ov/setupvars.sh
          ./build/tests/cpp/tests_continuous_batching
      - name: Run accuracy_sample
        run: |
          source ./ov/setupvars.sh
          timeout 120s ./build/samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy -m ./TinyLlama-1.1B-Chat-v1.0/ -n 5
      - name: Run throughput_benchmark
        run: |
          wget -q https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
          source ./ov/setupvars.sh
          ./build/samples/cpp/continuous_batching_benchmark/continuous_batching_benchmark -n 5 -m ./TinyLlama-1.1B-Chat-v1.0/ --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1