Skip to content

Commit

Permalink
fix: Fix the CUDAGraphs C++ runtime implementation
Browse files (browse the repository at this point in the history)
Signed-off-by: Naren Dasan <[email protected]>
Signed-off-by: Naren Dasan <[email protected]>
  • Loading branch information
Naren Dasan committed Aug 9, 2024
1 parent 655ed6b commit 96810f3
Show file tree
Hide file tree
Showing 28 changed files with 932 additions and 571 deletions.
64 changes: 47 additions & 17 deletions .github/workflows/build-test-linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ on:
- nightly
- release/*
tags:
# NOTE: Binary build pipelines should only get triggered on release candidate builds
# Release candidate tags look like: v1.11.0-rc1
- v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
# NOTE: Binary build pipelines should only get triggered on release candidate builds
# Release candidate tags look like: v1.11.0-rc1
- v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
workflow_dispatch:

jobs:
Expand Down Expand Up @@ -84,9 +84,9 @@ jobs:
popd
pushd .
cd tests/py/ts
python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_api_test_results.xml api/
python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_models_test_results.xml models/
python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_integrations_test_results.xml integrations/
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_api_test_results.xml api/
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_models_test_results.xml models/
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_integrations_test_results.xml integrations/
popd
tests-py-dynamo-converters:
Expand Down Expand Up @@ -114,7 +114,7 @@ jobs:
export USE_HOST_DEPS=1
pushd .
cd tests/py/dynamo
python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 10 conversion/
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 8 conversion/
popd
tests-py-dynamo-fe:
Expand Down Expand Up @@ -142,8 +142,8 @@ jobs:
export USE_HOST_DEPS=1
pushd .
cd tests/py/dynamo
python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_fe_test_results.xml --ir dynamo models/test_models_export.py
python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/test_dyn_models.py
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_fe_test_results.xml --ir dynamo models/test_models_export.py
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/test_dyn_models.py
popd
tests-py-dynamo-serde:
Expand Down Expand Up @@ -171,7 +171,7 @@ jobs:
export USE_HOST_DEPS=1
pushd .
cd tests/py/dynamo
python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py
popd
tests-py-torch-compile-be:
Expand Down Expand Up @@ -199,9 +199,9 @@ jobs:
export USE_HOST_DEPS=1
pushd .
cd tests/py/dynamo
python -m pytest -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/
python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_complete_be_e2e_test_results.xml --ir torch_compile models/test_models.py
python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_dyn_models_export.xml --ir torch_compile models/test_dyn_models.py
python -m pytest -ra -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/
python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_complete_be_e2e_test_results.xml --ir torch_compile models/test_models.py
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_dyn_models_export.xml --ir torch_compile models/test_dyn_models.py
popd
tests-py-dynamo-core:
Expand Down Expand Up @@ -229,9 +229,39 @@ jobs:
export USE_HOST_DEPS=1
pushd .
cd tests/py/dynamo
python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml runtime/
python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/
python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/
python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml --ignore runtime/test_002_cudagraphs_py.py --ignore runtime/test_002_cudagraphs_cpp.py runtime/
python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/
python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/
popd
tests-py-dynamo-cudagraphs:
name: Test dynamo cudagraphs [Python]
needs: [generate-matrix, build]
strategy:
fail-fast: false
matrix:
include:
- repository: pytorch/tensorrt
package-name: torch_tensorrt
pre-script: packaging/pre_build_script.sh
post-script: packaging/post_build_script.sh
smoke-test-script: packaging/smoke_test_script.sh
uses: ./.github/workflows/linux-test.yml
with:
job-name: tests-py-dynamo-cudagraphs
repository: "pytorch/tensorrt"
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
pre-script: ${{ matrix.pre-script }}
script: |
export USE_HOST_DEPS=1
pushd .
cd tests/py/dynamo
nvidia-smi
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_cpp_test_results.xml runtime/test_002_cudagraphs_cpp.py || true
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_py_test_results.xml runtime/test_002_cudagraphs_py.py || true
popd
tests-py-core:
Expand Down Expand Up @@ -259,7 +289,7 @@ jobs:
export USE_HOST_DEPS=1
pushd .
cd tests/py/core
python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml .
python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml .
popd
concurrency:
Expand Down
47 changes: 38 additions & 9 deletions .github/workflows/build-test-windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ on:
- nightly
- release/*
tags:
# NOTE: Binary build pipelines should only get triggered on release candidate builds
# Release candidate tags look like: v1.11.0-rc1
- v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
# NOTE: Binary build pipelines should only get triggered on release candidate builds
# Release candidate tags look like: v1.11.0-rc1
- v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
workflow_dispatch:

jobs:
Expand Down Expand Up @@ -192,8 +192,8 @@ jobs:
export USE_HOST_DEPS=1
pushd .
cd tests/py/dynamo
python -m pytest -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/
python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_complete_be_e2e_test_results.xml --ir torch_compile models/test_models.py
python -m pytest -ra -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/
python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_complete_be_e2e_test_results.xml --ir torch_compile models/test_models.py
python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_dyn_models_export.xml --ir torch_compile models/test_dyn_models.py
popd
Expand All @@ -219,9 +219,38 @@ jobs:
export USE_HOST_DEPS=1
pushd .
cd tests/py/dynamo
python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml runtime/
python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/
python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/
python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml --ignore runtime/test_002_cudagraphs_py.py --ignore runtime/test_002_cudagraphs_cpp.py runtime/
python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/
python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/
popd
tests-py-dynamo-cudagraphs:
name: Test dynamo cudagraphs [Python]
needs: [generate-matrix, build]
strategy:
fail-fast: false
matrix:
include:
- repository: pytorch/tensorrt
package-name: torch_tensorrt
pre-script: packaging/pre_build_script.sh
post-script: packaging/post_build_script.sh
smoke-test-script: packaging/smoke_test_script.sh
uses: ./.github/workflows/linux-test.yml
with:
job-name: tests-py-dynamo-cudagraphs
repository: "pytorch/tensorrt"
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
pre-script: ${{ matrix.pre-script }}
script: |
export USE_HOST_DEPS=1
pushd .
cd tests/py/dynamo
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_cpp_test_results.xml runtime/test_002_cudagraphs_cpp.py
#python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_py_test_results.xml runtime/test_002_cudagraphs_py.py
popd
tests-py-core:
Expand All @@ -246,7 +275,7 @@ jobs:
export USE_HOST_DEPS=1
pushd .
cd tests/py/core
python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml .
python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml .
popd
concurrency:
Expand Down
4 changes: 2 additions & 2 deletions core/conversion/var/Var.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ bool Var::isITensorList() {
// Unpack the Var as a List and check if each entry is a custom class since
// ITensors are stored in CustomClassHolder
auto ival_list = ptr_.ivalue->toList();
for (int i = 0; i < ival_list.size(); i++) {
for (size_t i = 0; i < ival_list.size(); i++) {
if (!ival_list.get(i).isCustomClass()) {
return false;
}
Expand All @@ -167,7 +167,7 @@ std::vector<nvinfer1::ITensor*> Var::unwrapToITensorList() {
TORCHTRT_CHECK(isITensorList(), "Expected IValue to be an ITensorList");
auto ivalue_list = ptr_.ivalue->toList();
std::vector<nvinfer1::ITensor*> outputs;
for (int i = 0; i < ivalue_list.size(); i++) {
for (size_t i = 0; i < ivalue_list.size(); i++) {
auto element = ivalue_list.get(i).toCustomClass<TensorContainer>()->tensor();
outputs.push_back(std::move(element));
}
Expand Down
11 changes: 2 additions & 9 deletions core/runtime/TRTEngine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,15 +71,6 @@ TRTEngine::TRTEngine(
multi_gpu_device_check();
set_rt_device(device_info);

// Set active stream to non-default stream
auto current_stream = c10::cuda::getCurrentCUDAStream(device_info.id);
if (current_stream == c10::cuda::getDefaultCUDAStream(device_info.id)) {
active_stream = c10::cuda::getStreamFromPool(false, device_info.id);
c10::cuda::setCurrentCUDAStream(active_stream);
} else {
active_stream = current_stream;
}

rt = make_trt(nvinfer1::createInferRuntime(util::logging::get_logger()));

name = slugify(mod_name);
Expand Down Expand Up @@ -205,6 +196,7 @@ TRTEngine::TRTEngine(
}

TRTEngine::~TRTEngine() {
cudagraph.reset();
trt_engine_profiler.reset();
exec_ctx.reset();
cuda_engine.reset();
Expand Down Expand Up @@ -253,6 +245,7 @@ void TRTEngine::set_profiling_paths() {
enqueue_profile_path = std::filesystem::path{profile_path_prefix + "/" + name + "_enqueue_profile.trace"}.string();
trt_engine_profile_path =
std::filesystem::path{profile_path_prefix + "/" + name + "_engine_exectuion_profile.trace"}.string();
cuda_graph_debug_path = std::filesystem::path{profile_path_prefix + "/" + name + "_cudagraph.dot"}.string();
}

std::string TRTEngine::to_str() const {
Expand Down
4 changes: 3 additions & 1 deletion core/runtime/TRTEngine.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ struct TRTEngine : torch::CustomClassHolder {

// CUDAGraph-Related Functionality
at::cuda::CUDAGraph cudagraph = {};
at::cuda::CUDAStream active_stream = c10::cuda::getDefaultCUDAStream();
at::cuda::CUDAStream engine_stream = c10::cuda::getDefaultCUDAStream();
at::cuda::CUDAStream caller_stream = c10::cuda::getDefaultCUDAStream();
std::vector<at::Tensor> input_buffers = {};
std::vector<at::Tensor> output_buffers = {};
std::string shape_key;
Expand All @@ -89,6 +90,7 @@ struct TRTEngine : torch::CustomClassHolder {
std::string output_profile_path;
std::string enqueue_profile_path;
std::string trt_engine_profile_path;
std::string cuda_graph_debug_path;
std::mutex mu;
std::unique_ptr<TRTEngineProfiler> trt_engine_profiler;
};
Expand Down
Loading

0 comments on commit 96810f3

Please sign in to comment.