Fix package import path #106

Merged on Mar 7, 2024 (37 commits)

Changes shown from 26 of 37 commits

Commits
3de6016
mv path
KepingYan Feb 8, 2024
4c95129
modify import path
KepingYan Feb 8, 2024
19d7fc2
modify package name
KepingYan Feb 8, 2024
c1394d4
update path
KepingYan Feb 20, 2024
ad9a55a
update
KepingYan Feb 20, 2024
4724b44
disable mpt-7b-bigdl
KepingYan Feb 21, 2024
3024036
update
KepingYan Feb 21, 2024
188e495
update for ui
KepingYan Feb 21, 2024
4a63f34
modify llmonray to llm_on_ray
KepingYan Feb 23, 2024
ac3cb59
simplify execution command
KepingYan Feb 23, 2024
c70c0ff
merge main branch
KepingYan Feb 23, 2024
0d36d0e
test
KepingYan Feb 23, 2024
961c176
Merge remote-tracking branch 'upstream/main' into fix_package_path
KepingYan Feb 23, 2024
f83ee7d
test
KepingYan Feb 23, 2024
1916e20
Merge remote-tracking branch 'upstream/main' into fix_package_path
KepingYan Feb 23, 2024
0219eeb
modify
KepingYan Feb 23, 2024
32e990e
Merge remote-tracking branch 'upstream/main' into fix_package_path
KepingYan Feb 26, 2024
77055bd
fix
KepingYan Feb 26, 2024
91a8429
update & disable vllm temporarily
KepingYan Feb 26, 2024
ce2f019
Merge remote-tracking branch 'upstream/main' into fix_package_path
KepingYan Feb 26, 2024
f8c59d3
test
KepingYan Feb 27, 2024
a6f1db6
test
KepingYan Feb 27, 2024
61731b9
test
KepingYan Feb 27, 2024
c50ffea
recover
KepingYan Feb 27, 2024
883c9eb
update
KepingYan Feb 27, 2024
6a07499
fix vllm
KepingYan Feb 28, 2024
4a16df0
update
KepingYan Feb 29, 2024
fd2e56e
merge main branch
KepingYan Feb 29, 2024
43af195
move mllm path
KepingYan Feb 29, 2024
93c4918
modify
KepingYan Mar 5, 2024
5933105
Merge remote-tracking branch 'upstream/main' into fix_package_path
KepingYan Mar 5, 2024
e51b244
fix err
KepingYan Mar 5, 2024
e055d98
remove import_all_modules
KepingYan Mar 6, 2024
3b0a89b
Merge remote-tracking branch 'upstream/main' into fix_package_path
KepingYan Mar 6, 2024
af9a299
Update .github/workflows/workflow_finetune.yml
xwu99 Mar 7, 2024
a2e6c57
add comment
KepingYan Mar 7, 2024
c741c01
add comment
KepingYan Mar 7, 2024
12 changes: 6 additions & 6 deletions .github/workflows/workflow_finetune.yml
@@ -85,7 +85,7 @@ jobs:
docker exec "finetune" bash -c "source \$(python -c 'import oneccl_bindings_for_pytorch as torch_ccl;print(torch_ccl.cwd)')/env/setvars.sh; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --head --node-ip-address 127.0.0.1 --ray-debugger-external; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address='127.0.0.1:6379' --ray-debugger-external"
CMD=$(cat << EOF
import yaml
conf_path = "finetune/finetune.yaml"
conf_path = "llm_on_ray/finetune/finetune.yaml"
with open(conf_path, encoding="utf-8") as reader:
result = yaml.load(reader, Loader=yaml.FullLoader)
result['General']['base_model'] = "${{ matrix.model }}"
@@ -113,14 +113,14 @@ jobs:
EOF
)
docker exec "finetune" python -c "$CMD"
docker exec "finetune" bash -c "python finetune/finetune.py --config_file finetune/finetune.yaml"
docker exec "finetune" bash -c "llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune.yaml"

- name: Run PEFT-LoRA Test
run: |
docker exec "finetune" bash -c "rm -rf /tmp/llm-ray/*"
CMD=$(cat << EOF
import yaml
conf_path = "finetune/finetune.yaml"
conf_path = "llm_on_ray/finetune/finetune.yaml"
with open(conf_path, encoding="utf-8") as reader:
result = yaml.load(reader, Loader=yaml.FullLoader)
result['General']['lora_config'] = {
@@ -138,7 +138,7 @@ jobs:
EOF
)
docker exec "finetune" python -c "$CMD"
docker exec "finetune" bash -c "python finetune/finetune.py --config_file finetune/finetune.yaml"
docker exec "finetune" bash -c "llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune.yaml"

- name: Run Deltatuner Test on DENAS-LoRA Model
run: |
@@ -150,7 +150,7 @@ jobs:
import os
import yaml
os.system("cp -r $(python -m pip show deltatuner | grep Location | cut -d: -f2)/deltatuner/conf/best_structure examples/")
conf_path = "finetune/finetune.yaml"
conf_path = "llm_on_ray/finetune/finetune.yaml"
with open(conf_path, encoding="utf-8") as reader:
result = yaml.load(reader, Loader=yaml.FullLoader)
result['General']['lora_config'] = {
@@ -168,7 +168,7 @@ jobs:
yaml.dump(result, output, sort_keys=False)
EOF)
docker exec "finetune" python -c "$CMD"
docker exec "finetune" bash -c "python finetune/finetune.py --config_file finetune/finetune.yaml"
docker exec "finetune" bash -c "llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune.yaml"
fi

- name: Stop Ray
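The finetune steps above all follow the same pattern: patch `llm_on_ray/finetune/finetune.yaml` in place with a short Python heredoc, then invoke the new `llm_on_ray-finetune` console script instead of `python finetune/finetune.py`. For reference, the patching step alone looks roughly like the sketch below (a minimal local version of the CI pattern; the base-model value is a placeholder rather than one of the CI matrix models):

```python
import yaml

# Minimal sketch of the CI pattern above: load the packaged finetune config,
# override a field, and write it back before invoking llm_on_ray-finetune.
conf_path = "llm_on_ray/finetune/finetune.yaml"

with open(conf_path, encoding="utf-8") as reader:
    result = yaml.load(reader, Loader=yaml.FullLoader)

result["General"]["base_model"] = "gpt2"  # placeholder model id

with open(conf_path, "w") as output:
    yaml.dump(result, output, sort_keys=False)
```

After patching, the workflow runs `llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune.yaml`, which is the invocation this PR standardizes on.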
22 changes: 11 additions & 11 deletions .github/workflows/workflow_inference.yml
@@ -118,14 +118,14 @@ jobs:
CMD=$(cat << EOF
import yaml
if ("${{ matrix.model }}" == "starcoder"):
conf_path = "inference/models/starcoder.yaml"
conf_path = "llm_on_ray/inference/models/starcoder.yaml"
with open(conf_path, encoding="utf-8") as reader:
result = yaml.load(reader, Loader=yaml.FullLoader)
result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
with open(conf_path, 'w') as output:
yaml.dump(result, output, sort_keys=False)
if ("${{ matrix.model }}" == "llama-2-7b-chat-hf"):
conf_path = "inference/models/llama-2-7b-chat-hf.yaml"
conf_path = "llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml"
with open(conf_path, encoding="utf-8") as reader:
result = yaml.load(reader, Loader=yaml.FullLoader)
result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
@@ -135,11 +135,11 @@ jobs:
)
docker exec "${TARGET}" python -c "$CMD"
if [[ ${{ matrix.model }} == "mpt-7b-bigdl" ]]; then
docker exec "${TARGET}" bash -c "python inference/serve.py --config_file inference/models/bigdl/mpt-7b-bigdl.yaml --simple"
docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/bigdl/mpt-7b-bigdl.yaml --simple"
elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
docker exec "${TARGET}" bash -c "python inference/serve.py --config_file .github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml --simple"
docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file .github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml --simple"
else
docker exec "${TARGET}" bash -c "python inference/serve.py --simple --models ${{ matrix.model }}"
docker exec "${TARGET}" bash -c "llm_on_ray-serve --simple --models ${{ matrix.model }}"
fi
echo Non-streaming query:
docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }}"
@@ -150,7 +150,7 @@ jobs:
if: ${{ matrix.dtuner_model }}
run: |
TARGET=${{steps.target.outputs.target}}
docker exec "${TARGET}" bash -c "python inference/serve.py --config_file .github/workflows/config/mpt_deltatuner.yaml --simple"
docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file .github/workflows/config/mpt_deltatuner.yaml --simple"
docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }}"
docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }} --streaming_response"

@@ -160,8 +160,8 @@ jobs:
if [[ ${{ matrix.model }} =~ ^(gpt2|falcon-7b|starcoder|mpt-7b.*)$ ]]; then
echo ${{ matrix.model }} is not supported!
elif [[ ! ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
docker exec "${TARGET}" bash -c "python .github/workflows/config/update_inference_config.py --config_file inference/models/\"${{ matrix.model }}\".yaml --output_file \"${{ matrix.model }}\".yaml.deepspeed --deepspeed"
docker exec "${TARGET}" bash -c "python inference/serve.py --config_file \"${{ matrix.model }}\".yaml.deepspeed --simple"
docker exec "${TARGET}" bash -c "python .github/workflows/config/update_inference_config.py --config_file llm_on_ray/inference/models/\"${{ matrix.model }}\".yaml --output_file \"${{ matrix.model }}\".yaml.deepspeed --deepspeed"
docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file \"${{ matrix.model }}\".yaml.deepspeed --simple"
docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }}"
docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }} --streaming_response"
fi
@@ -173,7 +173,7 @@ jobs:
if [[ ${{ matrix.model }} =~ ^(gpt2|falcon-7b|starcoder|mpt-7b.*)$ ]]; then
echo ${{ matrix.model }} is not supported!
else
docker exec "${TARGET}" bash -c "python inference/serve.py --config_file .github/workflows/config/mpt_deltatuner_deepspeed.yaml --simple"
docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file .github/workflows/config/mpt_deltatuner_deepspeed.yaml --simple"
docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }}"
docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }} --streaming_response"
fi
@@ -182,9 +182,9 @@
run: |
TARGET=${{steps.target.outputs.target}}
if [[ ${{ matrix.model }} == "mpt-7b-bigdl" ]]; then
docker exec "${TARGET}" bash -c "python inference/serve.py --config_file inference/models/bigdl/mpt-7b-bigdl.yaml"
docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/bigdl/mpt-7b-bigdl.yaml"
elif [[ ! ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
docker exec "${TARGET}" bash -c "python inference/serve.py --models ${{ matrix.model }}"
docker exec "${TARGET}" bash -c "llm_on_ray-serve --models ${{ matrix.model }}"
docker exec "${TARGET}" bash -c "python examples/inference/api_server_openai/query_http_requests.py --model_name ${{ matrix.model }}"
fi

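In the OpenAI-compatible deployments above, the workflow exercises the endpoint through `examples/inference/api_server_openai/query_http_requests.py`. Outside CI, a quick manual check can also be done with the official `openai` Python client; the sketch below assumes the server exposes the standard `/v1` chat-completions route on `localhost:8000` and that no real API key is required (both are assumptions about a typical local deployment, not details taken from this PR):

```python
from openai import OpenAI

# Sketch only: point base_url at the llm_on_ray-serve deployment. The route,
# model id, and dummy API key are assumptions for a local test, not values
# defined in this repository.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")

response = client.chat.completions.create(
    model="gpt2",  # one of the served model ids
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
```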
8 changes: 4 additions & 4 deletions .github/workflows/workflow_orders_on_merge.yml
@@ -7,11 +7,11 @@ on:
paths:
- '.github/**'
- 'docker/**'
- 'common/**'
- 'dev/docker/**'
- 'finetune/**'
- 'inference/**'
- 'rlhf/**'
- 'llm_on_ray/common/**'
- 'llm_on_ray/finetune/**'
- 'llm_on_ray/inference/**'
- 'llm_on_ray/rlhf/**'
- 'tools/**'
- 'pyproject.toml'
- 'tests/**'
8 changes: 4 additions & 4 deletions .github/workflows/workflow_orders_on_pr.yml
@@ -7,11 +7,11 @@ on:
paths:
- '.github/**'
- 'docker/**'
- 'common/**'
- 'dev/docker/**'
- 'finetune/**'
- 'inference/**'
- 'rlhf/**'
- 'llm_on_ray/common/**'
- 'llm_on_ray/finetune/**'
- 'llm_on_ray/inference/**'
- 'llm_on_ray/rlhf/**'
- 'tools/**'
- 'pyproject.toml'
- 'tests/**'
6 changes: 3 additions & 3 deletions README.md
@@ -62,14 +62,14 @@ ray start --head
Use the following command to finetune a model using an example dataset and default configurations. The finetuned model will be stored in `/tmp/llm-ray/output` by default. To customize the base model, dataset and configurations, please see the [finetuning document](#finetune):

```bash
python finetune/finetune.py --config_file finetune/finetune.yaml
llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune.yaml
```

### Serving
Deploy a model on Ray and expose an endpoint for serving. This command uses GPT2 as an example, but more model configuration examples can be found in the [inference/models](inference/models) directory:

```bash
python inference/serve.py --config_file inference/models/gpt2.yaml
llm_on_ray-serve --config_file llm_on_ray/inference/models/gpt2.yaml
```

The default served method is to provide an OpenAI-compatible API server ([OpenAI API Reference](https://platform.openai.com/docs/api-reference/chat)), you can access and test it in many ways:
@@ -95,7 +95,7 @@ python examples/inference/api_server_openai/query_openai_sdk.py
```
Or you can serve specific model to a simple endpoint according to the `port` and `route_prefix` parameters in configuration file,
```bash
python inference/serve.py --config_file inference/models/gpt2.yaml --simple
llm_on_ray-serve --config_file llm_on_ray/inference/models/gpt2.yaml --simple
```
After deploying the model endpoint, you can access and test it by using the script below:
```bash
9 changes: 0 additions & 9 deletions common/agentenv/__init__.py

This file was deleted.

9 changes: 0 additions & 9 deletions common/dataprocesser/__init__.py

This file was deleted.

9 changes: 0 additions & 9 deletions common/dataset/__init__.py

This file was deleted.

9 changes: 0 additions & 9 deletions common/initializer/__init__.py

This file was deleted.

9 changes: 0 additions & 9 deletions common/model/__init__.py

This file was deleted.

9 changes: 0 additions & 9 deletions common/optimizer/__init__.py

This file was deleted.

9 changes: 0 additions & 9 deletions common/tokenizer/__init__.py

This file was deleted.

9 changes: 0 additions & 9 deletions common/trainer/__init__.py

This file was deleted.
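Each of the eight deleted `__init__.py` files above was nine lines long, and the commit history includes `remove import_all_modules`, which suggests these sub-packages previously imported all of their modules eagerly at package-import time. The deleted contents are not shown in this diff; purely as a hypothetical illustration of that pattern (not a reconstruction of the actual files), such an `__init__.py` typically looks like:

```python
# Hypothetical example of an "import everything on package import" __init__.py.
# The real deleted files are not shown in this diff; this only illustrates the
# pattern implied by the "remove import_all_modules" commit.
import importlib
import pkgutil


def import_all_modules(package_path, package_name):
    """Import every module found directly under this package."""
    for module_info in pkgutil.iter_modules(package_path):
        importlib.import_module(f"{package_name}.{module_info.name}")


import_all_modules(__path__, __name__)
```

Removing eager imports like this fits the PR's move to explicit `llm_on_ray.*` import paths.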

2 changes: 1 addition & 1 deletion dev/docker/Dockerfile.bigdl-cpu
@@ -27,7 +27,7 @@ RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \
COPY ./pyproject.toml .
COPY ./MANIFEST.in .

RUN mkdir ./finetune && mkdir ./inference
RUN mkdir ./llm_on_ray

RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[bigdl-cpu] --extra-index-url https://download.pytorch.org/whl/cpu \
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
2 changes: 1 addition & 1 deletion dev/docker/Dockerfile.cpu_and_deepspeed
@@ -27,7 +27,7 @@ RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \
COPY ./pyproject.toml .
COPY ./MANIFEST.in .

RUN mkdir ./finetune && mkdir ./inference
RUN mkdir ./llm_on_ray

RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[cpu,deepspeed] --extra-index-url https://download.pytorch.org/whl/cpu \
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
4 changes: 2 additions & 2 deletions dev/docker/Dockerfile.vllm
@@ -22,13 +22,13 @@ RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \
unset -f conda && \
export PATH=$CONDA_DIR/bin/:${PATH} && \
conda config --add channels intel && \
conda install -y -c conda-forge python==3.9 gxx=12.3 gxx_linux-64=12.3
conda install -y -c conda-forge python==3.9 gxx=12.3 gxx_linux-64=12.3 libxcrypt

COPY ./pyproject.toml .
COPY ./MANIFEST.in .
COPY ./dev/scripts/install-vllm-cpu.sh .

RUN mkdir ./finetune && mkdir ./inference
RUN mkdir ./llm_on_ray

RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[cpu] --extra-index-url https://download.pytorch.org/whl/cpu \
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
2 changes: 1 addition & 1 deletion docs/finetune.md
@@ -65,5 +65,5 @@ The following models have been verified on Intel CPUs or GPUs.
## Finetune the model
To finetune your model, execute the following command. The finetuned model will be saved in /tmp/llm-ray/output by default.
``` bash
python finetune/finetune.py --config_file <your finetuning conf file>
llm_on_ray-finetune --config_file <your finetuning conf file>
```
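Since finetuning is now driven by the `llm_on_ray-finetune` console script, it can also be wrapped from Python, for example in a small automation script. The sketch below simply shells out to the entry point and then lists the default output directory mentioned above; the config path is a placeholder for your own finetuning conf file:

```python
import subprocess
from pathlib import Path

# Sketch: run the llm_on_ray-finetune console script installed by this package
# and inspect the default output location documented above.
subprocess.run(
    ["llm_on_ray-finetune", "--config_file", "llm_on_ray/finetune/finetune.yaml"],
    check=True,
)

output_dir = Path("/tmp/llm-ray/output")  # default per the finetuning docs
print("finetuned artifacts:", sorted(p.name for p in output_dir.iterdir()))
```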
10 changes: 5 additions & 5 deletions docs/pretrain.md
@@ -123,27 +123,27 @@ Set up `megatron_deepspeed_path` in the configuration.
```bash
cd /home/user/workspace/llm-on-ray
#Bloom-7B
python pretrain/megatron_deepspeed_pretrain.py --config_file pretrain/config/bloom_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf
llm_on_ray-megatron_deepspeed_pretrain --config_file llm_on_ray/pretrain/config/bloom_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf
#llama-7B
python pretrain/megatron_deepspeed_pretrain.py --config_file pretrain/config/llama_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf
llm_on_ray-megatron_deepspeed_pretrain --config_file llm_on_ray/pretrain/config/llama_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf
```

##### Huggingface Trainer
```bash
cd /home/user/workspace/llm-on-ray
#llama-7B
python pretrain/pretrain.py --config_file pretrain/config/llama_7b_8Guadi_pretrain.conf
llm_on_ray-pretrain --config_file llm_on_ray/pretrain/config/llama_7b_8Guadi_pretrain.conf
```
##### Nvidia GPU:
###### Megatron-DeepSpeed
```bash
cd /home/user/workspace/llm-on-ray
#llama2-7B
python pretrain/megatron_deepspeed_pretrain.py --config_file pretrain/config/llama2_3b_megatron_deepspeed_zs0_8gpus_pretrain.conf
llm_on_ray-megatron_deepspeed_pretrain --config_file llm_on_ray/pretrain/config/llama2_3b_megatron_deepspeed_zs0_8gpus_pretrain.conf
```
##### Huggingface Trainer
```bash
cd /home/user/workspace/llm-on-ray
#llama-7B
python pretrain/pretrain.py --config_file pretrain/config/llama_7b_8gpu_pretrain.conf
llm_on_ray-pretrain --config_file llm_on_ray/pretrain/config/llama_7b_8gpu_pretrain.conf
```
12 changes: 6 additions & 6 deletions docs/serve.md
@@ -30,22 +30,22 @@ LLM-on-Ray also supports serving with [Deepspeed](serve_deepspeed.md) for AutoTP
We support three methods to specify the models to be served, and they have the following priorities.
1. Use inference configuration file if config_file is set.
```
python inference/serve.py --config_file inference/models/gpt2.yaml
llm_on_ray-serve --config_file llm_on_ray/inference/models/gpt2.yaml
```
2. Use relevant configuration parameters if model_id_or_path is set.
```
python inference/serve.py --model_id_or_path gpt2 [--tokenizer_id_or_path gpt2 --port 8000 --route_prefix ...]
llm_on_ray-serve --model_id_or_path gpt2 [--tokenizer_id_or_path gpt2 --port 8000 --route_prefix ...]
```
3. If --config_file and --model_id_or_path are both None, it will serve all pre-defined models in inference/models/*.yaml, or part of them if models is set.
```
python inference/serve.py --models gpt2 gpt-j-6b
llm_on_ray-serve --models gpt2 gpt-j-6b
```
### OpenAI-compatible API
To deploy your model, execute the following command with the model's configuration file. This will create an OpenAI-compatible API ([OpenAI API Reference](https://platform.openai.com/docs/api-reference/chat)) for serving.
```bash
python inference/serve.py --config_file <path to the conf file>
llm_on_ray-serve --config_file <path to the conf file>
```
To deploy and serve multiple models concurrently, place all models' configuration files under `inference/models` and directly run `python inference/serve.py` without passing any conf file.
To deploy and serve multiple models concurrently, place all models' configuration files under `llm_on_ray/inference/models` and directly run `llm_on_ray-serve` without passing any conf file.

After deploying the model, you can access and test it in many ways:
```bash
@@ -71,7 +71,7 @@ python examples/inference/api_server_openai/query_openai_sdk.py
### Serving Model to a Simple Endpoint
This will create a simple endpoint for serving according to the `port` and `route_prefix` parameters in conf file, for example: http://127.0.0.1:8000/gpt2.
```bash
python inference/serve.py --config_file <path to the conf file> --simple
llm_on_ray-serve --config_file <path to the conf file> --simple
```
After deploying the model endpoint, you can access and test it by using the script below:
```bash
2 changes: 1 addition & 1 deletion docs/vllm.md
@@ -23,7 +23,7 @@ Please follow [Deploying and Serving LLMs on Intel CPU/GPU/Gaudi](serve.md) docu
To serve model with vLLM, run the following:

```bash
$ python serve.py --config_file inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml --simple --keep_serve_terminal
$ llm_on_ray-serve --config_file llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml --simple --keep_serve_terminal
```

In the above example, `vllm` property is set to `true` in the config file for enabling vLLM.
2 changes: 1 addition & 1 deletion docs/web_ui.md
@@ -14,7 +14,7 @@ $ dev/scripts/install-ui.sh
## Start Web UI

```bash
python -u ui/start_ui.py --node_user_name $user --conda_env_name $conda_env --master_ip_port "$node_ip:6379"
python -m llm_on_ray.ui.start_ui --node_user_name $user --conda_env_name $conda_env --master_ip_port "$node_ip:6379"
```
You will get URL from the command line output (E.g. http://0.0.0.0:8080 for local network and https://180cd5f7c31a1cfd3c.gradio.live for public network) and use the web browser to open it.
