feat(api): add gpt-4o-audio-preview model for chat completions (#1135)

This enables audio inputs and outputs. https://platform.openai.com/docs/guides/audio
openai · Oct 17, 2024 · 17a623f · 17a623f
1 parent 3c32662
commit 17a623f
Show file tree

Hide file tree

Showing 9 changed files with 183 additions and 7 deletions.
diff --git a/.stats.yml b/.stats.yml
@@ -1,2 +1,2 @@
 configured_endpoints: 68
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai-71e58a77027c67e003fdd1b1ac8ac11557d8bfabc7666d1a827c6b1ca8ab98b5.yml
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai-8729aaa35436531ab453224af10e67f89677db8f350f0346bb3537489edea649.yml
diff --git a/api.md b/api.md
@@ -33,16 +33,20 @@ Types:
 
 - <code><a href="./src/resources/chat/completions.ts">ChatCompletion</a></code>
 - <code><a href="./src/resources/chat/completions.ts">ChatCompletionAssistantMessageParam</a></code>
+- <code><a href="./src/resources/chat/completions.ts">ChatCompletionAudio</a></code>
+- <code><a href="./src/resources/chat/completions.ts">ChatCompletionAudioParam</a></code>
 - <code><a href="./src/resources/chat/completions.ts">ChatCompletionChunk</a></code>
 - <code><a href="./src/resources/chat/completions.ts">ChatCompletionContentPart</a></code>
 - <code><a href="./src/resources/chat/completions.ts">ChatCompletionContentPartImage</a></code>
+- <code><a href="./src/resources/chat/completions.ts">ChatCompletionContentPartInputAudio</a></code>
 - <code><a href="./src/resources/chat/completions.ts">ChatCompletionContentPartRefusal</a></code>
 - <code><a href="./src/resources/chat/completions.ts">ChatCompletionContentPartText</a></code>
 - <code><a href="./src/resources/chat/completions.ts">ChatCompletionFunctionCallOption</a></code>
 - <code><a href="./src/resources/chat/completions.ts">ChatCompletionFunctionMessageParam</a></code>
 - <code><a href="./src/resources/chat/completions.ts">ChatCompletionMessage</a></code>
 - <code><a href="./src/resources/chat/completions.ts">ChatCompletionMessageParam</a></code>
 - <code><a href="./src/resources/chat/completions.ts">ChatCompletionMessageToolCall</a></code>
+- <code><a href="./src/resources/chat/completions.ts">ChatCompletionModality</a></code>
 - <code><a href="./src/resources/chat/completions.ts">ChatCompletionNamedToolChoice</a></code>
 - <code><a href="./src/resources/chat/completions.ts">ChatCompletionRole</a></code>
 - <code><a href="./src/resources/chat/completions.ts">ChatCompletionStreamOptions</a></code>

diff --git a/src/index.ts b/src/index.ts
@@ -250,16 +250,20 @@ export namespace OpenAI {
   export import ChatModel = API.ChatModel;
   export import ChatCompletion = API.ChatCompletion;
   export import ChatCompletionAssistantMessageParam = API.ChatCompletionAssistantMessageParam;
+  export import ChatCompletionAudio = API.ChatCompletionAudio;
+  export import ChatCompletionAudioParam = API.ChatCompletionAudioParam;
   export import ChatCompletionChunk = API.ChatCompletionChunk;
   export import ChatCompletionContentPart = API.ChatCompletionContentPart;
   export import ChatCompletionContentPartImage = API.ChatCompletionContentPartImage;
+  export import ChatCompletionContentPartInputAudio = API.ChatCompletionContentPartInputAudio;
   export import ChatCompletionContentPartRefusal = API.ChatCompletionContentPartRefusal;
   export import ChatCompletionContentPartText = API.ChatCompletionContentPartText;
   export import ChatCompletionFunctionCallOption = API.ChatCompletionFunctionCallOption;
   export import ChatCompletionFunctionMessageParam = API.ChatCompletionFunctionMessageParam;
   export import ChatCompletionMessage = API.ChatCompletionMessage;
   export import ChatCompletionMessageParam = API.ChatCompletionMessageParam;
   export import ChatCompletionMessageToolCall = API.ChatCompletionMessageToolCall;
+  export import ChatCompletionModality = API.ChatCompletionModality;
   export import ChatCompletionNamedToolChoice = API.ChatCompletionNamedToolChoice;
   export import ChatCompletionRole = API.ChatCompletionRole;
   export import ChatCompletionStreamOptions = API.ChatCompletionStreamOptions;

diff --git a/src/lib/AbstractChatCompletionRunner.ts b/src/lib/AbstractChatCompletionRunner.ts
@@ -105,7 +105,9 @@ export class AbstractChatCompletionRunner<
       const message = this.messages[i];
       if (isAssistantMessage(message)) {
         const { function_call, ...rest } = message;
-        const ret: ChatCompletionMessage = {
+
+        // TODO: support audio here
+        const ret: Omit<ChatCompletionMessage, 'audio'> = {
           ...rest,
           content: (message as ChatCompletionMessage).content ?? null,
           refusal: (message as ChatCompletionMessage).refusal ?? null,

diff --git a/src/resources/beta/assistants.ts b/src/resources/beta/assistants.ts
@@ -298,6 +298,11 @@ export namespace AssistantStreamEvent {
     data: ThreadsAPI.Thread;
 
     event: 'thread.created';
+
+    /**
+     * Whether to enable input audio transcription.
+     */
+    enabled?: boolean;
   }
 
   /**
@@ -1084,6 +1089,11 @@ export interface ThreadStreamEvent {
   data: ThreadsAPI.Thread;
 
   event: 'thread.created';
+
+  /**
+   * Whether to enable input audio transcription.
+   */
+  enabled?: boolean;
 }
 
 export interface AssistantCreateParams {

diff --git a/src/resources/chat/chat.ts b/src/resources/chat/chat.ts
@@ -16,7 +16,10 @@ export type ChatModel =
   | 'gpt-4o'
   | 'gpt-4o-2024-08-06'
   | 'gpt-4o-2024-05-13'
+  | 'gpt-4o-realtime-preview'
   | 'gpt-4o-realtime-preview-2024-10-01'
+  | 'gpt-4o-audio-preview'
+  | 'gpt-4o-audio-preview-2024-10-01'
   | 'chatgpt-4o-latest'
   | 'gpt-4o-mini'
   | 'gpt-4o-mini-2024-07-18'
@@ -45,16 +48,20 @@ export namespace Chat {
   export import Completions = CompletionsAPI.Completions;
   export import ChatCompletion = CompletionsAPI.ChatCompletion;
   export import ChatCompletionAssistantMessageParam = CompletionsAPI.ChatCompletionAssistantMessageParam;
+  export import ChatCompletionAudio = CompletionsAPI.ChatCompletionAudio;
+  export import ChatCompletionAudioParam = CompletionsAPI.ChatCompletionAudioParam;
   export import ChatCompletionChunk = CompletionsAPI.ChatCompletionChunk;
   export import ChatCompletionContentPart = CompletionsAPI.ChatCompletionContentPart;
   export import ChatCompletionContentPartImage = CompletionsAPI.ChatCompletionContentPartImage;
+  export import ChatCompletionContentPartInputAudio = CompletionsAPI.ChatCompletionContentPartInputAudio;
   export import ChatCompletionContentPartRefusal = CompletionsAPI.ChatCompletionContentPartRefusal;
   export import ChatCompletionContentPartText = CompletionsAPI.ChatCompletionContentPartText;
   export import ChatCompletionFunctionCallOption = CompletionsAPI.ChatCompletionFunctionCallOption;
   export import ChatCompletionFunctionMessageParam = CompletionsAPI.ChatCompletionFunctionMessageParam;
   export import ChatCompletionMessage = CompletionsAPI.ChatCompletionMessage;
   export import ChatCompletionMessageParam = CompletionsAPI.ChatCompletionMessageParam;
   export import ChatCompletionMessageToolCall = CompletionsAPI.ChatCompletionMessageToolCall;
+  export import ChatCompletionModality = CompletionsAPI.ChatCompletionModality;
   export import ChatCompletionNamedToolChoice = CompletionsAPI.ChatCompletionNamedToolChoice;
   export import ChatCompletionRole = CompletionsAPI.ChatCompletionRole;
   export import ChatCompletionStreamOptions = CompletionsAPI.ChatCompletionStreamOptions;

diff --git a/src/resources/chat/completions.ts b/src/resources/chat/completions.ts
@@ -11,7 +11,10 @@ import { Stream } from '../../streaming';
 
 export class Completions extends APIResource {
   /**
-   * Creates a model response for the given chat conversation.
+   * Creates a model response for the given chat conversation. Learn more in the
+   * [text generation](https://platform.openai.com/docs/guides/text-generation),
+   * [vision](https://platform.openai.com/docs/guides/vision), and
+   * [audio](https://platform.openai.com/docs/guides/audio) guides.
    */
   create(
     body: ChatCompletionCreateParamsNonStreaming,
@@ -138,6 +141,12 @@ export interface ChatCompletionAssistantMessageParam {
    */
   role: 'assistant';
 
+  /**
+   * Data about a previous audio response from the model.
+   * [Learn more](https://platform.openai.com/docs/guides/audio).
+   */
+  audio?: ChatCompletionAssistantMessageParam.Audio | null;
+
   /**
    * The contents of the assistant message. Required unless `tool_calls` or
    * `function_call` is specified.
@@ -168,6 +177,17 @@ export interface ChatCompletionAssistantMessageParam {
 }
 
 export namespace ChatCompletionAssistantMessageParam {
+  /**
+   * Data about a previous audio response from the model.
+   * [Learn more](https://platform.openai.com/docs/guides/audio).
+   */
+  export interface Audio {
+    /**
+     * Unique identifier for a previous audio response from the model.
+     */
+    id: string;
+  }
+
   /**
    * @deprecated: Deprecated and replaced by `tool_calls`. The name and arguments of
    * a function that should be called, as generated by the model.
@@ -188,6 +208,54 @@ export namespace ChatCompletionAssistantMessageParam {
   }
 }
 
+/**
+ * If the audio output modality is requested, this object contains data about the
+ * audio response from the model.
+ * [Learn more](https://platform.openai.com/docs/guides/audio).
+ */
+export interface ChatCompletionAudio {
+  /**
+   * Unique identifier for this audio response.
+   */
+  id: string;
+
+  /**
+   * Base64 encoded audio bytes generated by the model, in the format specified in
+   * the request.
+   */
+  data: string;
+
+  /**
+   * The Unix timestamp (in seconds) for when this audio response will no longer be
+   * accessible on the server for use in multi-turn conversations.
+   */
+  expires_at: number;
+
+  /**
+   * Transcript of the audio generated by the model.
+   */
+  transcript: string;
+}
+
+/**
+ * Parameters for audio output. Required when audio output is requested with
+ * `modalities: ["audio"]`.
+ * [Learn more](https://platform.openai.com/docs/guides/audio).
+ */
+export interface ChatCompletionAudioParam {
+  /**
+   * Specifies the output audio format. Must be one of `wav`, `mp3`, `flac`, `opus`,
+   * or `pcm16`.
+   */
+  format: 'wav' | 'mp3' | 'flac' | 'opus' | 'pcm16';
+
+  /**
+   * Specifies the voice type. Supported voices are `alloy`, `echo`, `fable`, `onyx`,
+   * `nova`, and `shimmer`.
+   */
+  voice: 'alloy' | 'echo' | 'fable' | 'onyx' | 'nova' | 'shimmer';
+}
+
 /**
  * Represents a streamed chunk of a chat completion response returned by model,
  * based on the provided input.
@@ -371,8 +439,18 @@ export namespace ChatCompletionChunk {
   }
 }
 
-export type ChatCompletionContentPart = ChatCompletionContentPartText | ChatCompletionContentPartImage;
+/**
+ * Learn about
+ * [text inputs](https://platform.openai.com/docs/guides/text-generation).
+ */
+export type ChatCompletionContentPart =
+  | ChatCompletionContentPartText
+  | ChatCompletionContentPartImage
+  | ChatCompletionContentPartInputAudio;
 
+/**
+ * Learn about [image inputs](https://platform.openai.com/docs/guides/vision).
+ */
 export interface ChatCompletionContentPartImage {
   image_url: ChatCompletionContentPartImage.ImageURL;
 
@@ -397,6 +475,32 @@ export namespace ChatCompletionContentPartImage {
   }
 }
 
+/**
+ * Learn about [audio inputs](https://platform.openai.com/docs/guides/audio).
+ */
+export interface ChatCompletionContentPartInputAudio {
+  input_audio: ChatCompletionContentPartInputAudio.InputAudio;
+
+  /**
+   * The type of the content part. Always `input_audio`.
+   */
+  type: 'input_audio';
+}
+
+export namespace ChatCompletionContentPartInputAudio {
+  export interface InputAudio {
+    /**
+     * Base64 encoded audio data.
+     */
+    data: string;
+
+    /**
+     * The format of the encoded audio data. Currently supports "wav" and "mp3".
+     */
+    format: 'wav' | 'mp3';
+  }
+}
+
 export interface ChatCompletionContentPartRefusal {
   /**
    * The refusal message generated by the model.
@@ -409,6 +513,10 @@ export interface ChatCompletionContentPartRefusal {
   type: 'refusal';
 }
 
+/**
+ * Learn about
+ * [text inputs](https://platform.openai.com/docs/guides/text-generation).
+ */
 export interface ChatCompletionContentPartText {
   /**
    * The text content.
@@ -471,6 +579,13 @@ export interface ChatCompletionMessage {
    */
   role: 'assistant';
 
+  /**
+   * If the audio output modality is requested, this object contains data about the
+   * audio response from the model.
+   * [Learn more](https://platform.openai.com/docs/guides/audio).
+   */
+  audio?: ChatCompletionAudio | null;
+
   /**
    * @deprecated: Deprecated and replaced by `tool_calls`. The name and arguments of
    * a function that should be called, as generated by the model.
@@ -548,6 +663,8 @@ export namespace ChatCompletionMessageToolCall {
   }
 }
 
+export type ChatCompletionModality = 'text' | 'audio';
+
 /**
  * Specifies a tool the model should use. Use to force the model to call a specific
  * function.
@@ -743,6 +860,13 @@ export interface ChatCompletionCreateParamsBase {
    */
   model: (string & {}) | ChatAPI.ChatModel;
 
+  /**
+   * Parameters for audio output. Required when audio output is requested with
+   * `modalities: ["audio"]`.
+   * [Learn more](https://platform.openai.com/docs/guides/audio).
+   */
+  audio?: ChatCompletionAudioParam | null;
+
   /**
    * Number between -2.0 and 2.0. Positive values penalize new tokens based on their
    * existing frequency in the text so far, decreasing the model's likelihood to
@@ -812,10 +936,24 @@ export interface ChatCompletionCreateParamsBase {
 
   /**
    * Developer-defined tags and values used for filtering completions in the
-   * [dashboard](https://platform.openai.com/completions).
+   * [dashboard](https://platform.openai.com/chat-completions).
    */
   metadata?: Record<string, string> | null;
 
+  /**
+   * Output types that you would like the model to generate for this request. Most
+   * models are capable of generating text, which is the default:
+   *
+   * `["text"]`
+   *
+   * The `gpt-4o-audio-preview` model can also be used to
+   * [generate audio](https://platform.openai.com/docs/guides/audio). To request that
+   * this model generate both text and audio responses, you can use:
+   *
+   * `["text", "audio"]`
+   */
+  modalities?: Array<ChatCompletionModality> | null;
+
   /**
    * How many chat completion choices to generate for each input message. Note that
    * you will be charged based on the number of generated tokens across all of the
@@ -900,8 +1038,9 @@ export interface ChatCompletionCreateParamsBase {
   stop?: string | null | Array<string>;
 
   /**
-   * Whether or not to store the output of this completion request for traffic
-   * logging in the [dashboard](https://platform.openai.com/completions).
+   * Whether or not to store the output of this chat completion request for use in
+   * our [model distillation](https://platform.openai.com/docs/guides/distillation)
+   * or [evals](https://platform.openai.com/docs/guides/evals) products.
    */
   store?: boolean | null;
 
@@ -1049,16 +1188,20 @@ export type CompletionCreateParamsStreaming = ChatCompletionCreateParamsStreamin
 export namespace Completions {
   export import ChatCompletion = ChatCompletionsAPI.ChatCompletion;
   export import ChatCompletionAssistantMessageParam = ChatCompletionsAPI.ChatCompletionAssistantMessageParam;
+  export import ChatCompletionAudio = ChatCompletionsAPI.ChatCompletionAudio;
+  export import ChatCompletionAudioParam = ChatCompletionsAPI.ChatCompletionAudioParam;
   export import ChatCompletionChunk = ChatCompletionsAPI.ChatCompletionChunk;
   export import ChatCompletionContentPart = ChatCompletionsAPI.ChatCompletionContentPart;
   export import ChatCompletionContentPartImage = ChatCompletionsAPI.ChatCompletionContentPartImage;
+  export import ChatCompletionContentPartInputAudio = ChatCompletionsAPI.ChatCompletionContentPartInputAudio;
   export import ChatCompletionContentPartRefusal = ChatCompletionsAPI.ChatCompletionContentPartRefusal;
   export import ChatCompletionContentPartText = ChatCompletionsAPI.ChatCompletionContentPartText;
   export import ChatCompletionFunctionCallOption = ChatCompletionsAPI.ChatCompletionFunctionCallOption;
   export import ChatCompletionFunctionMessageParam = ChatCompletionsAPI.ChatCompletionFunctionMessageParam;
   export import ChatCompletionMessage = ChatCompletionsAPI.ChatCompletionMessage;
   export import ChatCompletionMessageParam = ChatCompletionsAPI.ChatCompletionMessageParam;
   export import ChatCompletionMessageToolCall = ChatCompletionsAPI.ChatCompletionMessageToolCall;
+  export import ChatCompletionModality = ChatCompletionsAPI.ChatCompletionModality;
   export import ChatCompletionNamedToolChoice = ChatCompletionsAPI.ChatCompletionNamedToolChoice;
   export import ChatCompletionRole = ChatCompletionsAPI.ChatCompletionRole;
   export import ChatCompletionStreamOptions = ChatCompletionsAPI.ChatCompletionStreamOptions;

diff --git a/src/resources/chat/index.ts b/src/resources/chat/index.ts
@@ -3,16 +3,20 @@
 export {
   ChatCompletion,
   ChatCompletionAssistantMessageParam,
+  ChatCompletionAudio,
+  ChatCompletionAudioParam,
   ChatCompletionChunk,
   ChatCompletionContentPart,
   ChatCompletionContentPartImage,
+  ChatCompletionContentPartInputAudio,
   ChatCompletionContentPartRefusal,
   ChatCompletionContentPartText,
   ChatCompletionFunctionCallOption,
   ChatCompletionFunctionMessageParam,
   ChatCompletionMessage,
   ChatCompletionMessageParam,
   ChatCompletionMessageToolCall,
+  ChatCompletionModality,
   ChatCompletionNamedToolChoice,
   ChatCompletionRole,
   ChatCompletionStreamOptions,