From fdae5abd150a46d597a832adac45f16581b4fca8 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Thu, 8 Aug 2024 15:48:51 -0400 Subject: [PATCH 01/10] add Completions API support --- .../docs/llms/nvidia_ai_endpoints.ipynb | 222 +++++++++++++++++ .../langchain_nvidia_ai_endpoints/__init__.py | 10 +- .../langchain_nvidia_ai_endpoints/_statics.py | 32 ++- .../langchain_nvidia_ai_endpoints/llm.py | 230 ++++++++++++++++++ .../tests/integration_tests/conftest.py | 28 ++- .../tests/integration_tests/test_base_url.py | 1 + .../test_completions_models.py | 150 ++++++++++++ .../integration_tests/test_register_model.py | 6 + .../ai-endpoints/tests/unit_tests/conftest.py | 8 +- .../unit_tests/test_completions_models.py | 149 ++++++++++++ .../tests/unit_tests/test_imports.py | 1 + .../tests/unit_tests/test_register_model.py | 26 ++ 12 files changed, 848 insertions(+), 15 deletions(-) create mode 100644 libs/ai-endpoints/docs/llms/nvidia_ai_endpoints.ipynb create mode 100644 libs/ai-endpoints/langchain_nvidia_ai_endpoints/llm.py create mode 100644 libs/ai-endpoints/tests/integration_tests/test_completions_models.py create mode 100644 libs/ai-endpoints/tests/unit_tests/test_completions_models.py diff --git a/libs/ai-endpoints/docs/llms/nvidia_ai_endpoints.ipynb b/libs/ai-endpoints/docs/llms/nvidia_ai_endpoints.ipynb new file mode 100644 index 00000000..c301fb8e --- /dev/null +++ b/libs/ai-endpoints/docs/llms/nvidia_ai_endpoints.ipynb @@ -0,0 +1,222 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# NVIDIA NIMs\n", + "\n", + ":::caution\n", + "You are currently on a page documenting the use of models as [text completion models](/docs/concepts/#llms).\n", + "Many popular models are [chat completion models](/docs/concepts/#chat-models).\n", + "\n", + "To use chat completion models, use [ChatNVIDA](/docs/integrations/chat/nvidia_ai_endpoints/) instead.\n", + ":::\n", + "\n", + "The `langchain-nvidia-ai-endpoints` package contains LangChain integrations building applications with models on \n", + "NVIDIA NIM inference microservice. NIM supports models across domains like chat, completion, embedding, and re-ranking models \n", + "from the community as well as NVIDIA. These models are optimized by NVIDIA to deliver the best performance on NVIDIA \n", + "accelerated infrastructure and deployed as a NIM, an easy-to-use, prebuilt containers that deploy anywhere using a single \n", + "command on NVIDIA accelerated infrastructure.\n", + "\n", + "NVIDIA hosted deployments of NIMs are available to test on the [NVIDIA API catalog](https://build.nvidia.com/). After testing, \n", + "NIMs can be exported from NVIDIA’s API catalog using the NVIDIA AI Enterprise license and run on-premises or in the cloud, \n", + "giving enterprises ownership and full control of their IP and AI application.\n", + "\n", + "NIMs are packaged as container images on a per model basis and are distributed as NGC container images through the NVIDIA NGC Catalog. 
\n", + "At their core, NIMs provide easy, consistent, and familiar APIs for running inference on an AI model.\n", + "\n", + "This example goes over how to use LangChain to interact with NVIDIA supported via the `NVIDIA` class.\n", + "\n", + "For more information on accessing the completion models through this api, check out the [NVIDIA](https://python.langchain.com/docs/integrations/llms/nvidia_ai_endpoints/) documentation.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Installation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#%pip install -qU langchain-nvidia-ai-endpoints" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "**To get started:**\n", + "\n", + "1. Create a free account with [NVIDIA](https://build.nvidia.com/), which hosts NVIDIA AI Foundation models.\n", + "\n", + "2. Click on your model of choice.\n", + "\n", + "3. Under `Input` select the `Python` tab, and click `Get API Key`. Then click `Generate Key`.\n", + "\n", + "4. Copy and save the generated key as `NVIDIA_API_KEY`. From there, you should have access to the endpoints." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from getpass import getpass\n", + "\n", + "# del os.environ['NVIDIA_API_KEY'] ## delete key and reset\n", + "if os.environ.get(\"NVIDIA_API_KEY\", \"\").startswith(\"nvapi-\"):\n", + " print(\"Valid NVIDIA_API_KEY already in environment. Delete to reset\")\n", + "else:\n", + " candidate_api_key = getpass(\"NVAPI Key (starts with nvapi-): \")\n", + " assert candidate_api_key.startswith(\"nvapi-\"), f\"{candidate_api_key[:5]}... is not a valid key\"\n", + " os.environ[\"NVIDIA_API_KEY\"] = candidate_api_key" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Usage\n", + "\n", + "See [LLM](/docs/how_to#llms) for full functionality." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_nvidia_ai_endpoints import NVIDIA" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "llm = NVIDIA().bind(max_tokens=256)\n", + "llm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prompt = \"# Function that does quicksort written in Rust without comments:\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(llm.invoke(prompt))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for chunk in llm.stream(prompt):\n", + " print(chunk, end=\"\", flush=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "llm.batch([prompt])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "await llm.ainvoke(prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "async for chunk in llm.astream(prompt):\n", + " print(chunk, end=\"\", flush=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "await llm.abatch([prompt])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "async for chunk in llm.astream_log(prompt):\n", + " print(chunk)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = llm.invoke(\n", + " \"X_train, y_train, X_test, y_test = train_test_split(X, y, test_size=0.1) #Train a logistic regression model, predict the labels on the test set and compute the accuracy score\"\n", + ")\n", + "print(response)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "langchain-nvidia-ai-endpoints-m0-Y4aGr-py3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/__init__.py b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/__init__.py index d5796e3c..bfda8d36 100644 --- a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/__init__.py +++ b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/__init__.py @@ -42,6 +42,14 @@ from langchain_nvidia_ai_endpoints._statics import Model, register_model from langchain_nvidia_ai_endpoints.chat_models import ChatNVIDIA from langchain_nvidia_ai_endpoints.embeddings import NVIDIAEmbeddings +from langchain_nvidia_ai_endpoints.llm import NVIDIA from langchain_nvidia_ai_endpoints.reranking import NVIDIARerank -__all__ = ["ChatNVIDIA", "NVIDIAEmbeddings", "NVIDIARerank", "register_model", "Model"] +__all__ = [ + "ChatNVIDIA", + "NVIDIA", + "NVIDIAEmbeddings", + "NVIDIARerank", + "register_model", + "Model", +] diff --git a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/_statics.py b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/_statics.py index 52e29679..a2992523 100644 --- 
a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/_statics.py +++ b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/_statics.py @@ -10,8 +10,8 @@ class Model(BaseModel): Model information. id: unique identifier for the model, passed as model parameter for requests - model_type: API type (chat, vlm, embedding, ranking, completion) - client: client name, e.g. ChatNVIDIA, NVIDIAEmbeddings, NVIDIARerank + model_type: API type (chat, vlm, embedding, ranking, completions) + client: client name, e.g. ChatNVIDIA, NVIDIAEmbeddings, NVIDIARerank, NVIDIA endpoint: custom endpoint for the model aliases: list of aliases for the model supports_tools: whether the model supports tool calling @@ -23,9 +23,11 @@ class Model(BaseModel): id: str # why do we have a model_type? because ChatNVIDIA can speak both chat and vlm. model_type: Optional[ - Literal["chat", "vlm", "embedding", "ranking", "completion", "qa"] + Literal["chat", "vlm", "embedding", "ranking", "completions", "qa"] + ] = None + client: Optional[ + Literal["ChatNVIDIA", "NVIDIAEmbeddings", "NVIDIARerank", "NVIDIA"] ] = None - client: Optional[Literal["ChatNVIDIA", "NVIDIAEmbeddings", "NVIDIARerank"]] = None endpoint: Optional[str] = None aliases: Optional[list] = None supports_tools: Optional[bool] = False @@ -42,6 +44,7 @@ def validate_client(cls, client: str, values: dict) -> str: "ChatNVIDIA": ("chat", "vlm", "qa"), "NVIDIAEmbeddings": ("embedding",), "NVIDIARerank": ("ranking",), + "NVIDIA": ("completions",), } model_type = values.get("model_type") if model_type not in supported[client]: @@ -491,14 +494,18 @@ def validate_client(cls, client: str, values: dict) -> str: ), } -# COMPLETION_MODEL_TABLE = { -# "mistralai/mixtral-8x22b-v0.1": Model( -# id="mistralai/mixtral-8x22b-v0.1", -# model_type="completion", -# client="NVIDIA", -# aliases=["ai-mixtral-8x22b"], -# ), -# } +COMPLETION_MODEL_TABLE = { + "bigcode/starcoder2-7b": Model( + id="bigcode/starcoder2-7b", + model_type="completions", + client="NVIDIA", + ), + "bigcode/starcoder2-15b": Model( + id="bigcode/starcoder2-15b", + model_type="completions", + client="NVIDIA", + ), +} OPENAI_MODEL_TABLE = { @@ -518,6 +525,7 @@ def validate_client(cls, client: str, values: dict) -> str: **VLM_MODEL_TABLE, **EMBEDDING_MODEL_TABLE, **RANKING_MODEL_TABLE, + **COMPLETION_MODEL_TABLE, } if "_INCLUDE_OPENAI" in os.environ: diff --git a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/llm.py b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/llm.py new file mode 100644 index 00000000..94d23cd4 --- /dev/null +++ b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/llm.py @@ -0,0 +1,230 @@ +from __future__ import annotations + +import os +import warnings +from typing import Any, Dict, Iterator, List, Optional + +from langchain_core.callbacks.manager import CallbackManagerForLLMRun +from langchain_core.language_models.llms import LLM +from langchain_core.outputs import GenerationChunk +from langchain_core.pydantic_v1 import Field, PrivateAttr, root_validator + +from langchain_nvidia_ai_endpoints._common import _NVIDIAClient +from langchain_nvidia_ai_endpoints._statics import Model + + +class NVIDIA(LLM): + """ + LangChain LLM that uses the Completions API with NVIDIA NIMs. 
+ """ + + class Config: + validate_assignment = True + + _client: _NVIDIAClient = PrivateAttr(_NVIDIAClient) + _default_model_name: str = "bigcode/starcoder2-7b" + _default_base_url: str = "https://integrate.api.nvidia.com/v1" + base_url: str = Field( + description="Base url for model listing and invocation", + ) + model: Optional[str] = Field(description="The model to use for completions.") + + _base_url_var = "NVIDIA_BASE_URL" + + _init_args: Dict[str, Any] = PrivateAttr() + """Stashed arguments given to the constructor that can be passed to + the Completions API endpoint.""" + + @root_validator(pre=True) + def _validate_base_url(cls, values: Dict[str, Any]) -> Dict[str, Any]: + values["base_url"] = ( + values.get(cls._base_url_var.lower()) + or values.get("base_url") + or os.getenv(cls._base_url_var) + or cls._default_base_url + ) + return values + + def __check_kwargs(self, kwargs: Dict[str, Any]) -> Dict[str, Any]: + """ + Check kwargs, warn for unknown keys, and return a copy recognized keys. + """ + completions_arguments = { + "frequency_penalty", + "max_tokens", + "presence_penalty", + "seed", + "stop", + "temperature", + "top_p", + "best_of", + "echo", + "logit_bias", + "logprobs", + "n", + "suffix", + "user", + "stream", + } + + recognized_kwargs = { + k: v for k, v in kwargs.items() if k in completions_arguments + } + unrecognized_kwargs = set(kwargs) - completions_arguments + if len(unrecognized_kwargs) > 0: + warnings.warn(f"Unrecognized, ignored arguments: {unrecognized_kwargs}") + + return recognized_kwargs + + def __init__(self, **kwargs: Any): + """ + Create a new NVIDIA LLM for Completions APIs. + + This class provides access to a NVIDIA NIM for completions. By default, it + connects to a hosted NIM, but can be configured to connect to a local NIM + using the `base_url` parameter. An API key is required to connect to the + hosted NIM. + + Args: + model (str): The model to use for reranking. + nvidia_api_key (str): The API key to use for connecting to the hosted NIM. + api_key (str): Alternative to nvidia_api_key. + base_url (str): The base URL of the NIM to connect to. + + API Key: + - The recommended way to provide the API key is through the `NVIDIA_API_KEY` + environment variable. + + Additional arguments that can be passed to the Completions API: + - max_tokens (int): The maximum number of tokens to generate. + - stop (str or List[str]): The stop sequence to use for generating completions. + - temperature (float): The temperature to use for generating completions. + - top_p (float): The top-p value to use for generating completions. + - frequency_penalty (float): The frequency penalty to apply to the completion. + - presence_penalty (float): The presence penalty to apply to the completion. + - seed (int): The seed to use for generating completions. + - best_of (int): The number of completions to generate and return the best of. + - echo (bool): Whether to echo the prompt in the completion. + - logit_bias (Dict[str, float]): The logit bias to apply to the completion. + - logprobs (int): The number of logprobs to return. + - n (int): The number of completions to generate. + - suffix (str): The suffix to use for generating completions. + - user (str): The user ID to use for generating completions. + + These additional arguments can also be passed with `bind()`, e.g. + `NVIDIA().bind(max_tokens=512)`, or pass directly to `invoke()` or `stream()`, + e.g. `NVIDIA().invoke("prompt", max_tokens=512)`. 
+ """ + super().__init__(**kwargs) + self._client = _NVIDIAClient( + base_url=self.base_url, + model_name=self.model, + default_hosted_model_name=self._default_model_name, + api_key=kwargs.pop("nvidia_api_key", kwargs.pop("api_key", None)), + infer_path="{base_url}/completions", + cls=self.__class__.__name__, + ) + # todo: only store the model in one place + # the model may be updated to a newer name during initialization + self.model = self._client.model_name + + # stash all additional args that can be passed to the Completions API, + # but first make sure we pull out any args that are processed elsewhere. + for key in [ + "model", + "nvidia_base_url", + "base_url", + ]: + if key in kwargs: + del kwargs[key] + self._init_args = self.__check_kwargs(kwargs) + + @property + def available_models(self) -> List[Model]: + """ + Get a list of available models that work with NVIDIARerank. + """ + return self._client.get_available_models(self.__class__.__name__) + + @classmethod + def get_available_models( + cls, + **kwargs: Any, + ) -> List[Model]: + """ + Get a list of available models that work with the Completions API. + """ + return cls(**kwargs).available_models + + @property + def _llm_type(self) -> str: + """ + Get the type of language model used by this chat model. + Used for logging purposes only. + """ + return "NVIDIA" + + def _call( + self, + prompt: str, + stop: Optional[List[str]] = None, + run_manager: Optional[CallbackManagerForLLMRun] = None, + **kwargs: Any, + ) -> str: + payload: Dict[str, Any] = { + "model": self.model, + "prompt": prompt, + **self._init_args, + **self.__check_kwargs(kwargs), + } + if stop: + payload["stop"] = stop + + if payload.get("stream", False): + warnings.warn("stream set to true for non-streaming call, ignoring") + del payload["stream"] + + response = self._client.get_req(payload=payload) + response.raise_for_status() + + # todo: handle response's usage and system_fingerprint + + choices = response.json()["choices"] + # todo: write a test for this by setting n > 1 on the request + # aug 2024: n > 1 is not supported by endpoints + if len(choices) > 1: + warnings.warn( + f"Multiple choices in response, returning only the first: {choices}" + ) + + return choices[0]["text"] + + def _stream( + self, + prompt: str, + stop: Optional[List[str]] = None, + run_manager: Optional[CallbackManagerForLLMRun] = None, + **kwargs: Any, + ) -> Iterator[GenerationChunk]: + payload: Dict[str, Any] = { + "model": self.model, + "prompt": prompt, + "stream": True, + **self._init_args, + **self.__check_kwargs(kwargs), + } + if stop: + payload["stop"] = stop + + # we construct payload w/ **kwargs positioned to override stream=True, + # this lets us know if a user passed stream=False + if not payload.get("stream", True): + warnings.warn("stream set to false for streaming call, ignoring") + payload["stream"] = True + + for chunk in self._client.get_req_stream(payload=payload): + content = chunk["content"] + generation = GenerationChunk(text=content) + if run_manager: # todo: add tests for run_manager + run_manager.on_llm_new_token(content, chunk=generation) + yield generation diff --git a/libs/ai-endpoints/tests/integration_tests/conftest.py b/libs/ai-endpoints/tests/integration_tests/conftest.py index f1dd58f6..e89d2a7d 100644 --- a/libs/ai-endpoints/tests/integration_tests/conftest.py +++ b/libs/ai-endpoints/tests/integration_tests/conftest.py @@ -3,7 +3,12 @@ import pytest from langchain_core.documents import Document -from langchain_nvidia_ai_endpoints import ChatNVIDIA, 
NVIDIAEmbeddings, NVIDIARerank +from langchain_nvidia_ai_endpoints import ( + NVIDIA, + ChatNVIDIA, + NVIDIAEmbeddings, + NVIDIARerank, +) from langchain_nvidia_ai_endpoints._statics import MODEL_TABLE, Model @@ -39,6 +44,12 @@ def pytest_addoption(parser: pytest.Parser) -> None: nargs="+", help="Run tests for a specific qa model or list of models", ) + parser.addoption( + "--completions-model-id", + action="store", + nargs="+", + help="Run tests for a specific completions model or list of models", + ) parser.addoption( "--embedding-model-id", action="store", @@ -98,6 +109,18 @@ def get_all_known_models() -> List[Model]: ] metafunc.parametrize("tool_model", models, ids=models) + if "completions_model" in metafunc.fixturenames: + models = [NVIDIA._default_model_name] + if model_list := metafunc.config.getoption("completions_model_id"): + models = model_list + if metafunc.config.getoption("all_models"): + models = [ + model.id + for model in NVIDIA(**mode).available_models + if model.model_type == "completions" + ] + metafunc.parametrize("completions_model", models, ids=models) + if "structured_model" in metafunc.fixturenames: models = ["meta/llama-3.1-8b-instruct"] if model_list := metafunc.config.getoption("structured_model_id"): @@ -163,6 +186,7 @@ def mode(request: pytest.FixtureRequest) -> dict: ChatNVIDIA, NVIDIAEmbeddings, NVIDIARerank, + NVIDIA, ] ) def public_class(request: pytest.FixtureRequest) -> type: @@ -180,5 +204,7 @@ def _contact_service(instance: Any) -> None: instance.compress_documents( documents=[Document(page_content="World")], query="Hello" ) + elif isinstance(instance, NVIDIA): + instance.invoke("Hello") return _contact_service diff --git a/libs/ai-endpoints/tests/integration_tests/test_base_url.py b/libs/ai-endpoints/tests/integration_tests/test_base_url.py index 3c821623..7c522c92 100644 --- a/libs/ai-endpoints/tests/integration_tests/test_base_url.py +++ b/libs/ai-endpoints/tests/integration_tests/test_base_url.py @@ -13,6 +13,7 @@ def mock_endpoints(requests_mock: Mocker) -> None: "/v1/embeddings", "/v1/chat/completions", "/v1/ranking", + "/v1/completions", ]: requests_mock.post( re.compile(f".*{endpoint}"), diff --git a/libs/ai-endpoints/tests/integration_tests/test_completions_models.py b/libs/ai-endpoints/tests/integration_tests/test_completions_models.py new file mode 100644 index 00000000..f7d1308a --- /dev/null +++ b/libs/ai-endpoints/tests/integration_tests/test_completions_models.py @@ -0,0 +1,150 @@ +# https://platform.openai.com/docs/api-reference/completions/create +# POST https://.../v1/completions +# model: str -- The ID of the model to use for completion. +# prompt: str | Array[str] -- The prompt(s) to generate completions for. +# best_of: Optional[int] (default: 1) -- An integer representing the number +# of completions to generate and score. +# The API will return the best completion +# of the group. +# echo: Optional[bool] (default: False) -- Whether to echo the prompt in addition +# to the completion. +# frequency_penalty: Optional[float] (default: 0.0) -- Float that penalizes new +# tokens. Range -2.0 to 2.0. +# logit_bias: Optional[Dict[str, float]] -- Dict containing token to logit bias. +# logprobs: Optional[int] (default: None) -- Integer representing the number of +# logprobs to return. 0 means no logprobs. +# Max value is 5. +# max_tokens: Optional[int] (default: 16) -- Integer representing the maximum number +# of tokens to generate. +# n: Optional[int] (default: 1) -- Integer representing the number of completions +# to generate. 
+# presence_penalty: Optional[float] (default: 0.0) -- Float that penalizes new tokens +# based on whether they appear in +# the text so far. Range -2.0 to +# 2.0. +# seed: Optional[int] (default: None) -- Integer seed that attempts to make the +# completions deterministic. +# stop: Optional[str|Array[str]] -- Token at which to stop generating completions. +# Up to 4 sequences. +# stream: Optional[bool] (default: False) -- Whether to stream back partial progress. +# stream_options: Optional[Dict["include_usage": bool]] -- Dict containing stream +# options. +# suffix: Optional[str] -- Suffix to add to the completion. +# temperature: Optional[float] (default: 1.0) -- Sampling temperature, between 0 and 2. +# top_p: Optional[float] (default: 1.0) -- Alternative to temperature sampling. +# user: Optional[str] -- User ID to associate with the request. +# +# Returns: +# id: str -- The ID of the completion. +# object: str -- Always "text_completion". +# created: int -- Unix timestamp of when the completion was created. +# model: str -- The ID of the model used to generate the completion. +# choices: List[{"finish_reason": "stop"|"length"|"content_filter", +# "index": int, +# "text": str, +# "logprobs": Optional[{"text_offset": array, +# "token_logprobs": array, +# "tokens": array, +# "top_logprobs": array}]}] -- +# List of completions generated by the model. +# usage: {"completion_tokens": int, +# "prompt_tokens": int, +# "total_tokens": int} -- Usage statistics for the model. +# system_fingerprint: str -- System fingerprint of the model used to generate +# the completion. + + +from typing import Any, Callable, Tuple + +import pytest + +from langchain_nvidia_ai_endpoints import NVIDIA + + +def invoke(llm: NVIDIA, prompt: str, **kwargs: Any) -> Tuple[str, int]: + return llm.invoke(prompt, **kwargs), 1 + + +def stream(llm: NVIDIA, prompt: str, **kwargs: Any) -> Tuple[str, int]: + response = "" + count = 0 + for chunk in llm.stream(prompt, **kwargs): + response += chunk + count += 1 + return response, count + + +@pytest.mark.parametrize( + "func, count", [(invoke, 0), (stream, 1)], ids=["invoke", "stream"] +) +def test_basic(completions_model: str, mode: dict, func: Callable, count: int) -> None: + llm = NVIDIA(model=completions_model, **mode) + response, cnt = func(llm, "Hello, my name is") + assert isinstance(response, str) + assert cnt > count, "Should have received more chunks" + + +@pytest.mark.parametrize( + "param, value", + [ + ("frequency_penalty", 0.5), + ("max_tokens", 32), + ("presence_penalty", 0.5), + ("seed", 1234), + ("stop", "Hello"), + ("temperature", 0.5), + ("top_p", 0.5), + ], +) +@pytest.mark.parametrize("func", [invoke, stream], ids=["invoke", "stream"]) +def test_params( + completions_model: str, mode: dict, param: str, value: Any, func: Callable +) -> None: + llm = NVIDIA(model=completions_model, **mode) + response, _ = func(llm, "Hello, my name is", **{param: value}) + assert isinstance(response, str) + + +@pytest.mark.parametrize( + "param, value", + [ + ("best_of", 5), + ("echo", True), + ("logit_bias", {"hello": 1.0}), + ("logprobs", 2), + ("n", 2), + ("suffix", "Hello"), + ("user", "1234"), + ], +) +@pytest.mark.parametrize("func", [invoke, stream], ids=["invoke", "stream"]) +@pytest.mark.xfail(reason="Not consistently implemented") +def test_params_incomplete( + completions_model: str, mode: dict, param: str, value: Any, func: Callable +) -> None: + llm = NVIDIA(model=completions_model, **mode) + response, _ = func(llm, "Hello, my name is", **{param: value}) + assert 
isinstance(response, str) + + +def test_invoke_with_stream_true(completions_model: str, mode: dict) -> None: + llm = NVIDIA(model=completions_model, **mode) + with pytest.warns(UserWarning) as record: + response = llm.invoke("Hello, my name is", stream=True) + assert isinstance(response, str) + assert len(record) == 1 + assert "stream set to true" in str(record[0].message) + assert "ignoring" in str(record[0].message) + + +def test_stream_with_stream_false(completions_model: str, mode: dict) -> None: + llm = NVIDIA(model=completions_model, **mode) + with pytest.warns(UserWarning) as record: + response = next(llm.stream("Hello, my name is", stream=False)) + assert isinstance(response, str) + assert len(record) == 1 + assert "stream set to false" in str(record[0].message) + assert "ignoring" in str(record[0].message) + + +# todo: check stream_options diff --git a/libs/ai-endpoints/tests/integration_tests/test_register_model.py b/libs/ai-endpoints/tests/integration_tests/test_register_model.py index 6488aee3..238f2cb5 100644 --- a/libs/ai-endpoints/tests/integration_tests/test_register_model.py +++ b/libs/ai-endpoints/tests/integration_tests/test_register_model.py @@ -4,6 +4,7 @@ import pytest from langchain_nvidia_ai_endpoints import ( + NVIDIA, ChatNVIDIA, Model, NVIDIAEmbeddings, @@ -34,6 +35,11 @@ "nv-rerank-qa-mistral-4b:1", "https://api.nvcf.nvidia.com/v2/nvcf/pexec/functions/0bf77f50-5c35-4488-8e7a-f49bb1974af6", ), + ( + NVIDIA, + "bigcode/starcoder2-7b", + "https://api.nvcf.nvidia.com/v2/nvcf/pexec/functions/dd7b01e7-732d-4da5-8e8d-315f79165a23", + ), ], ) def test_registered_model_functional( diff --git a/libs/ai-endpoints/tests/unit_tests/conftest.py b/libs/ai-endpoints/tests/unit_tests/conftest.py index f0790214..4288819e 100644 --- a/libs/ai-endpoints/tests/unit_tests/conftest.py +++ b/libs/ai-endpoints/tests/unit_tests/conftest.py @@ -3,7 +3,12 @@ import pytest import requests_mock -from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings, NVIDIARerank +from langchain_nvidia_ai_endpoints import ( + NVIDIA, + ChatNVIDIA, + NVIDIAEmbeddings, + NVIDIARerank, +) @pytest.fixture( @@ -11,6 +16,7 @@ ChatNVIDIA, NVIDIAEmbeddings, NVIDIARerank, + NVIDIA, ] ) def public_class(request: pytest.FixtureRequest) -> type: diff --git a/libs/ai-endpoints/tests/unit_tests/test_completions_models.py b/libs/ai-endpoints/tests/unit_tests/test_completions_models.py new file mode 100644 index 00000000..34de4c7c --- /dev/null +++ b/libs/ai-endpoints/tests/unit_tests/test_completions_models.py @@ -0,0 +1,149 @@ +import json +from functools import reduce +from operator import add +from typing import Any, Callable, List + +import pytest +import requests_mock + +from langchain_nvidia_ai_endpoints import NVIDIA + + +def invoke(llm: NVIDIA, prompt: str, **kwargs: Any) -> str: + return llm.invoke(prompt, **kwargs) + + +def stream(llm: NVIDIA, prompt: str, **kwargs: Any) -> str: + return reduce(add, llm.stream(prompt, **kwargs)) + + +mock_response = { + "id": "ID", + "object": "text_completion", + "created": 1234567890, + "model": "BOGUS", + "choices": [ + { + "index": 0, + "text": "COMPLETION", + } + ], + "usage": {"prompt_tokens": 7, "total_tokens": 207, "completion_tokens": 200}, +} + + +@pytest.fixture(scope="function") +def mock_v1_completions_invoke( + requests_mock: requests_mock.Mocker, +) -> requests_mock.Mocker: + requests_mock.post( + "https://integrate.api.nvidia.com/v1/completions", + json=mock_response, + ) + return requests_mock + + +@pytest.fixture(scope="function") +def 
mock_v1_completions_stream( + requests_mock: requests_mock.Mocker, +) -> requests_mock.Mocker: + requests_mock.post( + "https://integrate.api.nvidia.com/v1/completions", + text="\n\n".join( + [ + f"data: {json.dumps(mock_response)}", + "data: [DONE]", + ] + ), + ) + return requests_mock + + +@pytest.mark.parametrize( + "param, value", + [ + ("frequency_penalty", [0.25, 0.5, 0.75]), + ("max_tokens", [2, 32, 512]), + ("presence_penalty", [0.25, 0.5, 0.75]), + ("seed", [1, 1234, 4321]), + ("stop", ["Hello", "There", "World"]), + ("temperature", [0, 0.5, 1]), + ("top_p", [0, 0.5, 1]), + ("best_of", [1, 5, 10]), + ("echo", [True, False, True]), + ("logit_bias", [{"hello": 1.0}, {"there": 1.0}, {"world": 1.0}]), + ("logprobs", [1, 2, 3]), + ("n", [1, 2, 3]), + ("suffix", ["Hello", "There", "World"]), + ("user", ["Bob", "Alice", "Eve"]), + ], +) +@pytest.mark.parametrize( + "func, mock_name", + [(invoke, "mock_v1_completions_invoke"), (stream, "mock_v1_completions_stream")], + ids=["invoke", "stream"], +) +def test_params( + param: str, + value: List[Any], + func: Callable, + mock_name: str, + request: pytest.FixtureRequest, +) -> None: + """ + This tests the following... + - priority order (init -> bind -> infer) + - param passed to init, bind, invoke / stream + ...for each known Completion API param. + """ + + mock = request.getfixturevalue(mock_name) + + init, bind, infer = value + + llm = NVIDIA(api_key="BOGUS", **{param: init}) + func(llm, "IGNORED") + request_payload = mock.last_request.json() + assert param in request_payload + assert request_payload[param] == init + + bound_llm = llm.bind(**{param: bind}) + func(bound_llm, "IGNORED") + request_payload = mock.last_request.json() + assert param in request_payload + assert request_payload[param] == bind + + func(bound_llm, "IGNORED", **{param: infer}) + request_payload = mock.last_request.json() + assert param in request_payload + assert request_payload[param] == infer + + +@pytest.mark.parametrize( + "func, mock_name", + [(invoke, "mock_v1_completions_invoke"), (stream, "mock_v1_completions_stream")], + ids=["invoke", "stream"], +) +def test_params_unknown( + func: Callable, + mock_name: str, + request: pytest.FixtureRequest, +) -> None: + request.getfixturevalue(mock_name) + + with pytest.warns(UserWarning) as record: + llm = NVIDIA(api_key="BOGUS", init_unknown="INIT") + assert len(record) == 1 + assert "Unrecognized, ignored arguments: {'init_unknown'}" in str(record[0].message) + + with pytest.warns(UserWarning) as record: + func(llm, "IGNORED", arg_unknown="ARG") + assert len(record) == 1 + assert "Unrecognized, ignored arguments: {'arg_unknown'}" in str(record[0].message) + + bound_llm = llm.bind(bind_unknown="BIND") + + with pytest.warns(UserWarning) as record: + func(bound_llm, "IGNORED") + assert len(record) == 1 + assert "Unrecognized, ignored arguments: {'bind_unknown'}" in str(record[0].message) diff --git a/libs/ai-endpoints/tests/unit_tests/test_imports.py b/libs/ai-endpoints/tests/unit_tests/test_imports.py index e72c2c6c..200bbea4 100644 --- a/libs/ai-endpoints/tests/unit_tests/test_imports.py +++ b/libs/ai-endpoints/tests/unit_tests/test_imports.py @@ -4,6 +4,7 @@ "ChatNVIDIA", "NVIDIAEmbeddings", "NVIDIARerank", + "NVIDIA", "register_model", "Model", ] diff --git a/libs/ai-endpoints/tests/unit_tests/test_register_model.py b/libs/ai-endpoints/tests/unit_tests/test_register_model.py index d42bdee5..482d40dc 100644 --- a/libs/ai-endpoints/tests/unit_tests/test_register_model.py +++ 
b/libs/ai-endpoints/tests/unit_tests/test_register_model.py @@ -3,6 +3,7 @@ import pytest from langchain_nvidia_ai_endpoints import ( + NVIDIA, ChatNVIDIA, Model, NVIDIAEmbeddings, @@ -16,12 +17,19 @@ [ ("chat", "NVIDIAEmbeddings"), ("chat", "NVIDIARerank"), + ("chat", "NVIDIA"), ("vlm", "NVIDIAEmbeddings"), ("vlm", "NVIDIARerank"), + ("vlm", "NVIDIA"), ("embeddings", "ChatNVIDIA"), ("embeddings", "NVIDIARerank"), + ("embeddings", "NVIDIA"), ("ranking", "ChatNVIDIA"), ("ranking", "NVIDIAEmbeddings"), + ("ranking", "NVIDIA"), + ("completions", "ChatNVIDIA"), + ("completions", "NVIDIAEmbeddings"), + ("completions", "NVIDIARerank"), ], ) def test_mismatched_type_client(model_type: str, client: str) -> None: @@ -53,6 +61,7 @@ def test_registered_model_usable(public_class: type) -> None: "ChatNVIDIA": "chat", "NVIDIAEmbeddings": "embedding", "NVIDIARerank": "ranking", + "NVIDIA": "completions", }[public_class.__name__] with warnings.catch_warnings(): warnings.simplefilter("error") @@ -112,21 +121,38 @@ def test_registered_model_is_available() -> None: endpoint="BOGUS", ) ) + register_model( + Model( + id="test/completions", + model_type="completions", + client="NVIDIA", + endpoint="BOGUS", + ) + ) chat_models = ChatNVIDIA.get_available_models(api_key="BOGUS") embedding_models = NVIDIAEmbeddings.get_available_models(api_key="BOGUS") ranking_models = NVIDIARerank.get_available_models(api_key="BOGUS") + completions_models = NVIDIA.get_available_models(api_key="BOGUS") assert "test/chat" in [model.id for model in chat_models] assert "test/chat" not in [model.id for model in embedding_models] assert "test/chat" not in [model.id for model in ranking_models] + assert "test/chat" not in [model.id for model in completions_models] assert "test/embedding" not in [model.id for model in chat_models] assert "test/embedding" in [model.id for model in embedding_models] assert "test/embedding" not in [model.id for model in ranking_models] + assert "test/embedding" not in [model.id for model in completions_models] assert "test/rerank" not in [model.id for model in chat_models] assert "test/rerank" not in [model.id for model in embedding_models] assert "test/rerank" in [model.id for model in ranking_models] + assert "test/rerank" not in [model.id for model in completions_models] + + assert "test/completions" not in [model.id for model in chat_models] + assert "test/completions" not in [model.id for model in embedding_models] + assert "test/completions" not in [model.id for model in ranking_models] + assert "test/completions" in [model.id for model in completions_models] def test_registered_model_without_client_is_not_listed(public_class: type) -> None: From e8777338219b63b08010de885b6a130d2afe3f79 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Mon, 19 Aug 2024 08:33:02 -0400 Subject: [PATCH 02/10] fix spelling of completions and NVIDIA --- libs/ai-endpoints/langchain_nvidia_ai_endpoints/llm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/llm.py b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/llm.py index 94d23cd4..dabe62b5 100644 --- a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/llm.py +++ b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/llm.py @@ -86,7 +86,7 @@ def __init__(self, **kwargs: Any): hosted NIM. Args: - model (str): The model to use for reranking. + model (str): The model to use for completions. nvidia_api_key (str): The API key to use for connecting to the hosted NIM. api_key (str): Alternative to nvidia_api_key. 
base_url (str): The base URL of the NIM to connect to. @@ -142,7 +142,7 @@ def __init__(self, **kwargs: Any): @property def available_models(self) -> List[Model]: """ - Get a list of available models that work with NVIDIARerank. + Get a list of available models that work with NVIDIA. """ return self._client.get_available_models(self.__class__.__name__) From 386a42b805048703d194ce27b50bd8858bb7113f Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Mon, 19 Aug 2024 14:05:29 -0400 Subject: [PATCH 03/10] trim param docs to include only known functional params --- libs/ai-endpoints/langchain_nvidia_ai_endpoints/llm.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/llm.py b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/llm.py index dabe62b5..de58338b 100644 --- a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/llm.py +++ b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/llm.py @@ -103,13 +103,6 @@ def __init__(self, **kwargs: Any): - frequency_penalty (float): The frequency penalty to apply to the completion. - presence_penalty (float): The presence penalty to apply to the completion. - seed (int): The seed to use for generating completions. - - best_of (int): The number of completions to generate and return the best of. - - echo (bool): Whether to echo the prompt in the completion. - - logit_bias (Dict[str, float]): The logit bias to apply to the completion. - - logprobs (int): The number of logprobs to return. - - n (int): The number of completions to generate. - - suffix (str): The suffix to use for generating completions. - - user (str): The user ID to use for generating completions. These additional arguments can also be passed with `bind()`, e.g. `NVIDIA().bind(max_tokens=512)`, or pass directly to `invoke()` or `stream()`, From 6ff6bb7dc4a97d2e45cdd7b3ede316c85b474fed Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Tue, 27 Aug 2024 07:21:19 -0400 Subject: [PATCH 04/10] fix ChatNVIDA -> ChatNVIDIA --- libs/ai-endpoints/docs/llms/nvidia_ai_endpoints.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/ai-endpoints/docs/llms/nvidia_ai_endpoints.ipynb b/libs/ai-endpoints/docs/llms/nvidia_ai_endpoints.ipynb index c301fb8e..681df1b3 100644 --- a/libs/ai-endpoints/docs/llms/nvidia_ai_endpoints.ipynb +++ b/libs/ai-endpoints/docs/llms/nvidia_ai_endpoints.ipynb @@ -10,7 +10,7 @@ "You are currently on a page documenting the use of models as [text completion models](/docs/concepts/#llms).\n", "Many popular models are [chat completion models](/docs/concepts/#chat-models).\n", "\n", - "To use chat completion models, use [ChatNVIDA](/docs/integrations/chat/nvidia_ai_endpoints/) instead.\n", + "To use chat completion models, use [ChatNVIDIA](/docs/integrations/chat/nvidia_ai_endpoints/) instead.\n", ":::\n", "\n", "The `langchain-nvidia-ai-endpoints` package contains LangChain integrations building applications with models on \n", From e3b290e73591929cd466fc94c3e64928274890d0 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Tue, 27 Aug 2024 07:27:53 -0400 Subject: [PATCH 05/10] add Completions example to README --- libs/ai-endpoints/README.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/libs/ai-endpoints/README.md b/libs/ai-endpoints/README.md index 7171c9a0..68f06985 100644 --- a/libs/ai-endpoints/README.md +++ b/libs/ai-endpoints/README.md @@ -225,6 +225,29 @@ llm.invoke( ) ``` +## Completions + +You can also work with models that support the Completions API. 
These models accept a `prompt` instead of `messages`. + +```python +completions_llm = NVIDIA().bind(max_tokens=512) +[model.id for model in completions_llm.get_available_models()] + +# [ +# ... +# 'bigcode/starcoder2-7b', +# 'bigcode/starcoder2-15b', +# ... +# ] +``` + +```python +prompt = "# Function that does quicksort written in Rust without comments:" +for chunk in completions_llm.stream(prompt): + print(chunk, end="", flush=True) +``` + + ## Embeddings You can also connect to embeddings models through this package. Below is an example: From ac5d18a7cbb9f50796c2986dd2862f282aff3355 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Tue, 27 Aug 2024 17:00:43 -0400 Subject: [PATCH 06/10] set default model to nvidia/mistral-nemo-minitron-8b-base --- libs/ai-endpoints/langchain_nvidia_ai_endpoints/_statics.py | 5 +++++ libs/ai-endpoints/langchain_nvidia_ai_endpoints/llm.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/_statics.py b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/_statics.py index a2992523..21affcd7 100644 --- a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/_statics.py +++ b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/_statics.py @@ -505,6 +505,11 @@ def validate_client(cls, client: str, values: dict) -> str: model_type="completions", client="NVIDIA", ), + "nvidia/mistral-nemo-minitron-8b-base": Model( + id="nvidia/mistral-nemo-minitron-8b-base", + model_type="completions", + client="NVIDIA", + ), } diff --git a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/llm.py b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/llm.py index de58338b..0dac0956 100644 --- a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/llm.py +++ b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/llm.py @@ -22,7 +22,7 @@ class Config: validate_assignment = True _client: _NVIDIAClient = PrivateAttr(_NVIDIAClient) - _default_model_name: str = "bigcode/starcoder2-7b" + _default_model_name: str = "nvidia/mistral-nemo-minitron-8b-base" _default_base_url: str = "https://integrate.api.nvidia.com/v1" base_url: str = Field( description="Base url for model listing and invocation", From 4c351cd1519fae2b07a9775c72857b2cba3c7c6b Mon Sep 17 00:00:00 2001 From: Daniel Glogowski Date: Tue, 27 Aug 2024 19:45:40 -0700 Subject: [PATCH 07/10] updated llm nb --- .../docs/llms/nvidia_ai_endpoints.ipynb | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/libs/ai-endpoints/docs/llms/nvidia_ai_endpoints.ipynb b/libs/ai-endpoints/docs/llms/nvidia_ai_endpoints.ipynb index 681df1b3..a4a41f76 100644 --- a/libs/ai-endpoints/docs/llms/nvidia_ai_endpoints.ipynb +++ b/libs/ai-endpoints/docs/llms/nvidia_ai_endpoints.ipynb @@ -128,6 +128,15 @@ "print(llm.invoke(prompt))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Stream, Batch, and Async\n", + "\n", + "These models natively support streaming, and as is the case with all LangChain LLMs they expose a batch method to handle concurrent requests, as well as async methods for invoke, stream, and batch. Below are a few examples." + ] + }, { "cell_type": "code", "execution_count": null, @@ -196,6 +205,25 @@ ")\n", "print(response)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Supported models\n", + "\n", + "Querying `available_models` will still give you all of the other models offered by your API credentials." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "NVIDIA.get_available_models()\n", + "# llm.get_available_models()" + ] } ], "metadata": { From a785670a0ce1681b5d122854eeb59356bece3969 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Wed, 28 Aug 2024 06:40:59 -0400 Subject: [PATCH 08/10] add _identifying_params --- libs/ai-endpoints/langchain_nvidia_ai_endpoints/llm.py | 10 ++++++++++ .../tests/unit_tests/test_completions_models.py | 5 +++++ 2 files changed, 15 insertions(+) diff --git a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/llm.py b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/llm.py index 0dac0956..12f364a5 100644 --- a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/llm.py +++ b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/llm.py @@ -157,6 +157,16 @@ def _llm_type(self) -> str: """ return "NVIDIA" + @property + def _identifying_params(self) -> Dict[str, Any]: + """ + Get parameters used to help identify the LLM. + """ + return { + "model": self.model, + "base_url": self.base_url, + } + def _call( self, prompt: str, diff --git a/libs/ai-endpoints/tests/unit_tests/test_completions_models.py b/libs/ai-endpoints/tests/unit_tests/test_completions_models.py index 34de4c7c..24239bb7 100644 --- a/libs/ai-endpoints/tests/unit_tests/test_completions_models.py +++ b/libs/ai-endpoints/tests/unit_tests/test_completions_models.py @@ -147,3 +147,8 @@ def test_params_unknown( func(bound_llm, "IGNORED") assert len(record) == 1 assert "Unrecognized, ignored arguments: {'bind_unknown'}" in str(record[0].message) + + +def test_identifying_params() -> None: + llm = NVIDIA(api_key="BOGUS") + assert set(llm._identifying_params.keys()) == {"model", "base_url"} From a42e389d6c679faff352b0241d1232e8bca7e4a0 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Wed, 28 Aug 2024 06:53:51 -0400 Subject: [PATCH 09/10] add ainvoke / astream basic tests --- .../test_completions_models.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/libs/ai-endpoints/tests/integration_tests/test_completions_models.py b/libs/ai-endpoints/tests/integration_tests/test_completions_models.py index f7d1308a..3fb4e724 100644 --- a/libs/ai-endpoints/tests/integration_tests/test_completions_models.py +++ b/libs/ai-endpoints/tests/integration_tests/test_completions_models.py @@ -74,6 +74,19 @@ def stream(llm: NVIDIA, prompt: str, **kwargs: Any) -> Tuple[str, int]: return response, count +async def ainvoke(llm: NVIDIA, prompt: str, **kwargs: Any) -> Tuple[str, int]: + return await llm.ainvoke(prompt, **kwargs), 1 + + +async def astream(llm: NVIDIA, prompt: str, **kwargs: Any) -> Tuple[str, int]: + response = "" + count = 0 + async for chunk in llm.astream(prompt, **kwargs): + response += chunk + count += 1 + return response, count + + @pytest.mark.parametrize( "func, count", [(invoke, 0), (stream, 1)], ids=["invoke", "stream"] ) @@ -84,6 +97,16 @@ def test_basic(completions_model: str, mode: dict, func: Callable, count: int) - assert cnt > count, "Should have received more chunks" +@pytest.mark.parametrize( + "func, count", [(ainvoke, 0), (astream, 1)], ids=["ainvoke", "astream"] +) +async def test_abasic(completions_model: str, mode: dict, func: Callable, count: int) -> None: + llm = NVIDIA(model=completions_model, **mode) + response, cnt = await func(llm, "Hello, my name is") + assert isinstance(response, str) + assert cnt > count, "Should have received more chunks" + + @pytest.mark.parametrize( "param, value", [ From 
f21f394308f5615479fc9fa383e1918dd9a41811 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Wed, 28 Aug 2024 06:57:17 -0400 Subject: [PATCH 10/10] fix lint --- .../tests/integration_tests/test_completions_models.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libs/ai-endpoints/tests/integration_tests/test_completions_models.py b/libs/ai-endpoints/tests/integration_tests/test_completions_models.py index 3fb4e724..74a7927e 100644 --- a/libs/ai-endpoints/tests/integration_tests/test_completions_models.py +++ b/libs/ai-endpoints/tests/integration_tests/test_completions_models.py @@ -100,7 +100,9 @@ def test_basic(completions_model: str, mode: dict, func: Callable, count: int) - @pytest.mark.parametrize( "func, count", [(ainvoke, 0), (astream, 1)], ids=["ainvoke", "astream"] ) -async def test_abasic(completions_model: str, mode: dict, func: Callable, count: int) -> None: +async def test_abasic( + completions_model: str, mode: dict, func: Callable, count: int +) -> None: llm = NVIDIA(model=completions_model, **mode) response, cnt = await func(llm, "Hello, my name is") assert isinstance(response, str)
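
---

For reviewers trying the series locally, below is a minimal usage sketch of the `NVIDIA` completions client these patches add. It only exercises calls that appear in the patches themselves (`invoke`, `stream`, `get_available_models`); it assumes `NVIDIA_API_KEY` is set in the environment and that the hosted `bigcode/starcoder2-7b` endpoint is reachable, and the parameter values are illustrative rather than recommended defaults.

```python
from langchain_nvidia_ai_endpoints import NVIDIA

# Connects to the hosted NIM endpoint by default; pass base_url= for a local NIM.
llm = NVIDIA(model="bigcode/starcoder2-7b")

prompt = "# Function that does quicksort written in Rust without comments:"

# One-shot completion; Completions API params (max_tokens, temperature, ...) can
# be passed per call, via bind(), or to the constructor.
print(llm.invoke(prompt, max_tokens=256, temperature=0.2))

# Streaming yields the completion as string chunks.
for chunk in llm.stream(prompt):
    print(chunk, end="", flush=True)

# List registered models that advertise the completions API.
for model in NVIDIA.get_available_models():
    if model.model_type == "completions":
        print(model.id)
```

Note that, per `__check_kwargs`, unrecognized keyword arguments are dropped with a `UserWarning` rather than raising, so typos in parameter names fail silently apart from the warning.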