NVIDIA · janekl · Jan 3, 2025 · Jan 3, 2025 · Jan 3, 2025
diff --git a/examples/llm/sft/hf_vllm.py b/examples/llm/sft/hf_vllm.py
@@ -42,7 +42,7 @@
         triton_model_name=args.triton_model_name,
         triton_model_version=1,
         max_batch_size=64,
-        port=8000,
+        http_port=8000,
         address="0.0.0.0",
     )
 

diff --git a/nemo/deploy/deploy_pytriton.py b/nemo/deploy/deploy_pytriton.py
@@ -38,7 +38,7 @@ class DeployPyTriton(DeployBase):
             tensor_parallelism_size=1,
         )
 
-        nm = DeployPyTriton(model=trt_llm_exporter, triton_model_name="model_name", port=8000)
+        nm = DeployPyTriton(model=trt_llm_exporter, triton_model_name="model_name", http_port=8000)
         nm.deploy()
         nm.run()
         nq = NemoQueryLLM(url="localhost", model_name="model_name")

diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 import tarfile
 from contextlib import nullcontext
 from typing import Callable, Optional
@@ -250,5 +251,6 @@ def export(self, model: MegatronGPTModel):
             if dist.get_rank() == 0:
                 save_artifacts(model, export_dir)
                 if compress:
-                    with tarfile.open(self.export_config.save_path, "w:gz") as tar:
+                    os.makedirs(os.path.dirname(self.export_config.save_path), exist_ok=True)
+                    with tarfile.open(self.export_config.save_path, "w") as tar:
                         tar.add(export_dir, arcname="./")
diff --git a/scripts/deploy/multimodal/deploy_triton.py b/scripts/deploy/multimodal/deploy_triton.py
@@ -212,7 +212,7 @@ def nemo_deploy(argv):
             triton_model_name=args.triton_model_name,
             triton_model_version=args.triton_model_version,
             max_batch_size=args.max_batch_size,
-            port=args.triton_port,
+            http_port=args.triton_port,
             address=args.triton_http_address,
         )
 

diff --git a/scripts/deploy/nlp/deploy_inframework_triton.py b/scripts/deploy/nlp/deploy_inframework_triton.py
@@ -91,7 +91,7 @@ def nemo_deploy(argv):
                 triton_model_name=args.triton_model_name,
                 triton_model_version=args.triton_model_version,
                 max_batch_size=args.max_batch_size,
-                port=args.triton_port,
+                http_port=args.triton_port,
                 address=args.triton_http_address,
             )
 

diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py
@@ -411,7 +411,7 @@ def nemo_deploy(argv):
             triton_model_name=args.triton_model_name,
             triton_model_version=args.triton_model_version,
             max_batch_size=args.max_batch_size,
-            port=args.triton_port,
+            http_port=args.triton_port,
             address=args.triton_http_address,
             streaming=args.enable_streaming,
         )

diff --git a/scripts/deploy/nlp/deploy_vllm_triton.py b/scripts/deploy/nlp/deploy_vllm_triton.py
@@ -156,7 +156,7 @@ def nemo_deploy(argv):
             triton_model_name=args.triton_model_name,
             triton_model_version=args.triton_model_version,
             max_batch_size=args.max_batch_size,
-            port=args.triton_port,
+            http_port=args.triton_port,
             address=args.triton_http_address,
             streaming=args.enable_streaming,
         )

diff --git a/tests/deploy/nemo_deploy.py b/tests/deploy/nemo_deploy.py
@@ -136,7 +136,7 @@ def run_in_framework_inference(
     nm = DeployPyTriton(
         model=model,
         triton_model_name=model_name,
-        port=8000,
+        http_port=8000,
     )
     nm.deploy()
     nm.run()
@@ -286,7 +286,7 @@ def run_trt_llm_inference(
             nm = DeployPyTriton(
                 model=trt_llm_exporter,
                 triton_model_name=model_name,
-                port=8000,
+                http_port=8000,
             )
             nm.deploy()
             nm.run()

diff --git a/tests/export/nemo_export.py b/tests/export/nemo_export.py
@@ -398,7 +398,7 @@ def run_inference(
             nm = DeployPyTriton(
                 model=exporter,
                 triton_model_name=model_name,
-                port=8000,
+                http_port=8000,
             )
             nm.deploy()
             nm.run()
@@ -579,7 +579,7 @@ def run_in_framework_inference(
         nm = DeployPyTriton(
             model=deployed_model,
             triton_model_name=model_name,
-            port=8000,
+            http_port=8000,
         )
         nm.deploy()
         nm.run()