Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into make-v1-testable
Browse files Browse the repository at this point in the history
  • Loading branch information
joerunde committed Nov 6, 2024
2 parents f706be4 + 098f94d commit 7f63da9
Show file tree
Hide file tree
Showing 206 changed files with 6,195 additions and 953 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@

def read_markdown(file):
if os.path.exists(file):
with open(file, "r") as f:
with open(file) as f:
return f.read() + "\n"
else:
return f"{file} not found.\n"
Expand All @@ -75,14 +75,14 @@ def results_to_json(latency, throughput, serving):
# collect results
for test_file in results_folder.glob("*.json"):

with open(test_file, "r") as f:
with open(test_file) as f:
raw_result = json.loads(f.read())

if "serving" in str(test_file):
# this result is generated via `benchmark_serving.py`

# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f:
with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
raw_result.update(command)

Expand All @@ -97,7 +97,7 @@ def results_to_json(latency, throughput, serving):
# this result is generated via `benchmark_latency.py`

# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f:
with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
raw_result.update(command)

Expand All @@ -119,7 +119,7 @@ def results_to_json(latency, throughput, serving):
# this result is generated via `benchmark_throughput.py`

# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f:
with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
raw_result.update(command)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,15 +72,15 @@ def main(args):

# collect results
for test_file in results_folder.glob("*_nightly_results.json"):
with open(test_file, "r") as f:
with open(test_file) as f:
results = results + json.loads(f.read())

# generate markdown table
df = pd.DataFrame.from_dict(results)

md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)

with open(args.description, "r") as f:
with open(args.description) as f:
description = f.read()

description = description.format(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,11 @@
# collect results
for test_file in results_folder.glob("*.json"):

with open(test_file, "r") as f:
with open(test_file) as f:
raw_result = json.loads(f.read())

# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f:
with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
raw_result.update(command)

Expand Down
2 changes: 1 addition & 1 deletion .buildkite/run-openvino-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ trap remove_docker_container EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py
4 changes: 2 additions & 2 deletions .buildkite/test-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -321,15 +321,14 @@ steps:
- tests/models/decoder_only/language
commands:
- pytest -v -s models/decoder_only/language/test_models.py
- pytest -v -s models/decoder_only/language/test_big_models.py

- label: Decoder-only Language Models Test (Extended) # 1h20min
nightly: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/language
commands:
- pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py --ignore=models/decoder_only/language/test_big_models.py
- pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py

- label: Decoder-only Multi-Modal Models Test (Standard)
#mirror_hardwares: [amd]
Expand Down Expand Up @@ -511,6 +510,7 @@ steps:
# NOTE: don't test llama model here, it seems hf implementation is buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py
- torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
- pytest -v -s -x lora/test_mixtral.py

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/mypy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
python-version: ["3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
- name: Set up Python ${{ matrix.python-version }}
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ jobs:
fail-fast: false
matrix:
os: ['ubuntu-20.04']
python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
python-version: ['3.9', '3.10', '3.11', '3.12']
pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
cuda-version: ['11.8', '12.1']

Expand Down
32 changes: 16 additions & 16 deletions .github/workflows/ruff.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,19 +29,19 @@ jobs:
matrix:
python-version: ["3.12"]
steps:
- uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements-lint.txt
- name: Analysing the code with ruff
run: |
echo "::add-matcher::.github/workflows/matchers/ruff.json"
ruff check --output-format github .
- name: Run isort
run: |
isort . --check-only
- uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements-lint.txt
- name: Analysing the code with ruff
run: |
echo "::add-matcher::.github/workflows/matchers/ruff.json"
ruff check --output-format github .
- name: Run isort
run: |
isort . --check-only
26 changes: 13 additions & 13 deletions .github/workflows/yapf.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,16 @@ jobs:
matrix:
python-version: ["3.12"]
steps:
- uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install yapf==0.32.0
pip install toml==0.10.2
- name: Running yapf
run: |
yapf --diff --recursive .
- uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install yapf==0.32.0
pip install toml==0.10.2
- name: Running yapf
run: |
yapf --diff --recursive .
11 changes: 5 additions & 6 deletions .readthedocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,16 @@ version: 2
build:
os: ubuntu-22.04
tools:
python: "3.8"
python: "3.12"

sphinx:
configuration: docs/source/conf.py
fail_on_warning: true
configuration: docs/source/conf.py
fail_on_warning: true

# If using Sphinx, optionally build your docs in additional formats such as PDF
formats: []

# Optionally declare the Python requirements required to build your docs
python:
install:
- requirements: docs/requirements-docs.txt

install:
- requirements: docs/requirements-docs.txt
38 changes: 19 additions & 19 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
#
set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11" "3.12")
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")

# Supported NVIDIA architectures.
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
Expand Down Expand Up @@ -128,17 +128,17 @@ endif()

if(VLLM_GPU_LANG STREQUAL "CUDA")
#
# For cuda we want to be able to control which architectures we compile for on
# For cuda we want to be able to control which architectures we compile for on
# a per-file basis in order to cut down on compile time. So here we extract
# the set of architectures we want to compile for and remove them from the
# the set of architectures we want to compile for and remove them from the
# CMAKE_CUDA_FLAGS so that they are not applied globally.
#
clear_cuda_arches(CUDA_ARCH_FLAGS)
extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
# Filter the target architectures by the supported archs
# since for some files we will build for all CUDA_ARCHS.
cuda_archs_loose_intersection(CUDA_ARCHS
cuda_archs_loose_intersection(CUDA_ARCHS
"${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
else()
Expand Down Expand Up @@ -236,7 +236,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# are not supported by Machete yet.
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS})
if (MARLIN_ARCHS)
set(MARLIN_SRCS
set(MARLIN_SRCS
"csrc/quantization/fp8/fp8_marlin.cu"
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
Expand Down Expand Up @@ -277,15 +277,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"in CUDA target architectures")
endif()

# clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
# clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
# build any 3x kernels
set(SCALED_MM_3X_ARCHS)
endif()

#
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
# kernels for the remaining archs that are not already built for 3x.
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
"7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
Expand Down Expand Up @@ -316,10 +316,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
#
# For the Machete kernels we automatically generate sources for various
# For the Machete kernels we automatically generate sources for various
# preselected input type pairs and schedules.
# Generate sources:
set(MACHETE_GEN_SCRIPT
set(MACHETE_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py)
file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH)

Expand All @@ -329,8 +329,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH}
OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
RESULT_VARIABLE machete_generation_result
OUTPUT_VARIABLE machete_generation_output
Expand All @@ -340,11 +340,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

if (NOT machete_generation_result EQUAL 0)
message(FATAL_ERROR "Machete generation failed."
" Result: \"${machete_generation_result}\""
" Result: \"${machete_generation_result}\""
"\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
else()
set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
CACHE STRING "Last run machete generate script hash" FORCE)
message(STATUS "Machete generation completed successfully.")
endif()
Expand All @@ -366,7 +366,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
AND MACHETE_ARCHS)
message(STATUS "Not building Machete kernels as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
Expand All @@ -392,8 +392,8 @@ define_gpu_extension_target(
USE_SABI 3
WITH_SOABI)

# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
# driver API. This causes problems when linking with earlier versions of CUDA.
# Setting this variable sidesteps the issue by calling the driver directly.
target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
Expand Down Expand Up @@ -471,9 +471,9 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
return()
endif ()

# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
# arches in the CUDA case (and instead set the gencodes on a per file basis)
# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
# arches in the CUDA case (and instead set the gencodes on a per file basis)
# we need to manually set VLLM_GPU_ARCHES here.
if(VLLM_GPU_LANG STREQUAL "CUDA")
foreach(_ARCH ${CUDA_ARCHS})
Expand Down
16 changes: 16 additions & 0 deletions Dockerfile.hpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Image for running vLLM on HPU devices.
# Base image comes from the Habana vault; per its path it is the 1.18.0
# release on Ubuntu 22.04 with the PyTorch 2.4.0 installer.
FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest

# Copy the whole build context (the repository checkout) into the image.
COPY ./ /workspace/vllm

# Run the install steps below from inside the copied source tree.
WORKDIR /workspace/vllm

# Install the HPU-specific Python requirements (verbose pip output).
RUN pip install -v -r requirements-hpu.txt

# Keep local addresses out of any configured proxy.
ENV no_proxy=localhost,127.0.0.1
# NOTE(review): presumably needed for multi-device (tensor-parallel) runs on
# HPU -- confirm against the Habana / vLLM HPU documentation.
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

# Build and install vLLM with the HPU target device selected.
RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install

# Leave the source tree before the entrypoint runs.
# NOTE(review): presumably so the installed package is imported rather than
# the ./vllm source directory -- confirm.
WORKDIR /workspace/

# Exec-form entrypoint: arguments given to `docker run` are appended to it.
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
11 changes: 11 additions & 0 deletions benchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,14 @@ You can download the dataset by running:
```bash
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
```

## Downloading the ShareGPT4V dataset

The JSON file refers to several image datasets (COCO, LLaVA, etc.). The benchmark scripts
will ignore a datapoint if the referenced image is missing. The commands below download the
JSON file and the COCO `train2017` images:
```bash
wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json
mkdir coco -p
wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip
unzip coco/train2017.zip -d coco/
```
Loading

0 comments on commit 7f63da9

Please sign in to comment.