From 62d012447f0539a6c0d0ebb711cc3b86a1b554b1 Mon Sep 17 00:00:00 2001
From: Saaket Agashe
Date: Thu, 17 Oct 2024 11:02:31 -0700
Subject: [PATCH] add API setup instructions, remove stray and deprecated
 Engine and Agent code

---
 .DS_Store                   | Bin 10244 -> 0 bytes
 .gitignore                  |   3 +-
 README.md                   |  20 ++++
 agent_s/MultimodalAgent.py  | 164 +++----------------------
 agent_s/MultimodalEngine.py | 187 ------------------------------------
 5 files changed, 33 insertions(+), 341 deletions(-)
 delete mode 100644 .DS_Store

diff --git a/.DS_Store b/.DS_Store
deleted file mode 100644
index 1cc996173f8a4ddfb16a21070ddfa112004f3f6b..0000000000000000000000000000000000000000
Binary files a/.DS_Store and /dev/null differ

diff --git a/.gitignore b/.gitignore
index c81bd44..cf1f613 100644
--- a/.gitignore
+++ b/.gitignore
@@ -160,4 +160,5 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
-logs/
\ No newline at end of file
+logs/
+.DS_Store
\ No newline at end of file

diff --git a/README.md b/README.md
index e5a1d3c..b0dbc7f 100644
--- a/README.md
+++ b/README.md
@@ -48,6 +48,26 @@ Install the agent_s package and dependencies
 ```
 pip install -e .
 ```
 
+Set your LLM API keys and other environment variables. You can do this by adding the following lines to your .bashrc (Linux) or .zshrc (macOS) file. We support OpenAI, Azure OpenAI, Anthropic, and vLLM models.
+
+1. OpenAI
+```
+export OPENAI_API_KEY=
+```
+2. Anthropic
+```
+export ANTHROPIC_API_KEY=
+```
+3. OpenAI on Azure
+```
+export AZURE_OPENAI_API_BASE=
+export AZURE_OPENAI_API_KEY=
+```
+4. vLLM for Local Models
+```
+export vLLM_ENDPOINT_URL=
+```
+
 ### Setup Retrieval from Web using Perplexica
 1. Ensure Docker is installed and running on your system.
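As a usage note for the keys above: the agent constructor in agent_s/MultimodalAgent.py (changed below) dispatches on engine_params["engine_type"], and the values retained by this patch are "openai", "anthropic", "azure", and "vllm". The following is a minimal sketch of how the exported variables might pair with engine_params; the LMMAgent class name, the "model" key, and the model strings are illustrative assumptions, not confirmed by this patch.

```
# Sketch only: engine_type strings mirror the dispatch kept in MultimodalAgent.py.
# The LMMAgent import/name, "model" keys, and model strings are illustrative assumptions.
from agent_s.MultimodalAgent import LMMAgent  # assumed class name

# OpenAI-hosted model; expected to read OPENAI_API_KEY from the environment (per the README above).
openai_params = {"engine_type": "openai", "model": "gpt-4o"}

# Anthropic-hosted model; expected to read ANTHROPIC_API_KEY.
anthropic_params = {"engine_type": "anthropic", "model": "claude-3-5-sonnet-20241022"}

# Locally served model behind vLLM; expected to read vLLM_ENDPOINT_URL.
vllm_params = {"engine_type": "vllm", "model": "Qwen/Qwen2-VL-7B-Instruct"}

agent = LMMAgent(engine_params=openai_params, system_prompt="You are a GUI agent.")
```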
diff --git a/agent_s/MultimodalAgent.py b/agent_s/MultimodalAgent.py
index a7e271b..c8b43a2 100644
--- a/agent_s/MultimodalAgent.py
+++ b/agent_s/MultimodalAgent.py
@@ -5,24 +5,12 @@ from agent_s.MultimodalEngine import (
     LMMEngineOpenAI,
     LMMEngineAzureOpenAI,
-    LMMEngineLlava,
-    LMMEngineCogVLM,
     LMMEnginevLLM,
     LMMEngineAnthropic,
-    LMMEngineQwen,
 )
 import base64
 import re
 
-# TODO: Import only if module exists, else ignore
-# from llava.constants import (
-#     IMAGE_TOKEN_INDEX,
-#     DEFAULT_IMAGE_TOKEN,
-#     DEFAULT_IM_START_TOKEN,
-#     DEFAULT_IM_END_TOKEN,
-#     IMAGE_PLACEHOLDER,
-# )
-
 data_type_map = {
     "openai": {"image_url": "image_url"},
     "anthropic": {"image_url": "image"},
@@ -42,12 +30,6 @@ def __init__(self, engine_params=None, system_prompt=None, engine=None):
                 self.engine = LMMEngineAzureOpenAI(**engine_params)
             elif engine_type == "vllm":
                 self.engine = LMMEnginevLLM(**engine_params)
-            elif engine_type == "qwen":
-                self.engine = LMMEngineQwen(**engine_params)
-            elif engine_type == "llava":
-                self.engine = LMMEngineLlava(**engine_params)
-            elif engine_type == "cogvlm":
-                self.engine = LMMEngineCogVLM(**engine_params)
             else:
                 raise ValueError("engine_type must be either 'openai' or 'azure'")
         else:
@@ -73,15 +55,13 @@ def encode_image(self, image_content):
     def reset(
         self,
     ):
-        if isinstance(self.engine, (LMMEngineCogVLM, LMMEngineLlava)):
-            self.messages = []
-        else:
-            self.messages = [
-                {
-                    "role": "system",
-                    "content": [{"type": "text", "text": self.system_prompt}],
-                }
-            ]
+
+        self.messages = [
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": self.system_prompt}],
+            }
+        ]
 
     def add_system_prompt(self, system_prompt):
         self.system_prompt = system_prompt
@@ -98,12 +78,6 @@ def add_system_prompt(self, system_prompt):
                 }
             )
 
-        # Don't add the system prompt if we are using llava or other hf models
-        if isinstance(self.engine, LMMEngineLlava) or isinstance(
-            self.engine, LMMEngineCogVLM
-        ):
-            self.messages = []
-
     def remove_message_at(self, index):
         """Remove a message at a given index"""
         if index < len(self.messages):
@@ -135,80 +109,8 @@ def add_message(
     ):
         """Add a new message to the list of messages"""
 
-        # For inference from locally hosted llava based on https://github.com/haotian-liu/LLaVA/
-        if isinstance(self.engine, LMMEngineLlava):
-
-            # No system prompt so first message will be from user
-            if len(self.messages) == 0:
-                role = "user"
-            else:
-                # infer role from previous message
-                if self.messages[-1]["role"] == "user":
-                    role = "assistant"
-                elif self.messages[-1]["role"] == "assistant":
-                    role = "user"
-
-            image_token_se = (
-                DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
-            )
-
-            qs = text_content
-            if role == "user":
-                if len(self.messages) == 0:
-                    # If this is the very first user message, add the system prompt to it to dictate behavior
-                    qs = self.system_prompt + "\n" + qs
-                # TODO: Add comment explaining what this next part does
-                if IMAGE_PLACEHOLDER in qs:
-                    if self.engine.model.config.mm_use_im_start_end:
-                        qs = re.sub(IMAGE_PLACEHOLDER, image_token_se, qs)
-                    else:
-                        qs = re.sub(IMAGE_PLACEHOLDER, DEFAULT_IMAGE_TOKEN, qs)
-                else:
-                    if self.engine.model.config.mm_use_im_start_end:
-                        qs = image_token_se + "\n" + qs
-                    else:
-                        qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
-
-                message = {"role": role, "content": qs}
-            else:
-                message = {"role": role, "content": text_content}
-
-            # Capable of handling only one image right now. TODO: make capable of handling more images
-            if image_content:
-                if self.engine.args.image_file == None:
-                    self.engine.args.image_file = image_content
-
-            self.messages.append(message)
-
-        elif isinstance(self.engine, LMMEngineCogVLM):
-            # No system prompt so first message will be from user
-            if len(self.messages) == 0:
-                role = "user"
-            else:
-                # infer role from previous message
-                if self.messages[-1]["role"] == "user":
-                    role = "assistant"
-                elif self.messages[-1]["role"] == "assistant":
-                    role = "user"
-
-            # Add message content as a new message, if this is the first message prepend with system prompt
-            if len(self.messages) == 0:
-                self.messages.append(
-                    {
-                        "role": role,
-                        "content": {
-                            "type": "text",
-                            "text": self.system_prompt + "\n\n" + text_content,
-                        },
-                    }
-                )
-            else:
-                self.messages.append(
-                    {"role": role, "content": {"type": "text", "text": text_content}}
-                )
-
-        # For API-style inference from OpenAI and AzureOpenAI
-        elif isinstance(self.engine, (LMMEngineOpenAI, LMMEngineAzureOpenAI)):
+        # API-style inference from OpenAI and AzureOpenAI
+        if isinstance(self.engine, (LMMEngineOpenAI, LMMEngineAzureOpenAI)):
             # infer role from previous message
             if role != "user":
                 if self.messages[-1]["role"] == "system": role = "user"
                 elif self.messages[-1]["role"] == "user": role = "assistant"
                 elif self.messages[-1]["role"] == "assistant": role = "user"
@@ -299,8 +201,8 @@ def add_message(
             )
             self.messages.append(message)
 
-        # Custom Qwen Model inference
-        elif isinstance(self.engine, LMMEngineQwen):
+        # Locally hosted vLLM model inference
+        elif isinstance(self.engine, LMMEnginevLLM):
             # infer role from previous message
             if role != "user":
                 if self.messages[-1]["role"] == "system": role = "user"
                 elif self.messages[-1]["role"] == "user": role = "assistant"
                 elif self.messages[-1]["role"] == "assistant": role = "user"
@@ -338,50 +240,6 @@ def add_message(
             )
             self.messages.append(message)
 
-        # Custom Llama3.2 Model inference
-        elif isinstance(self.engine, LMMEngineTogether):
-            # infer role from previous message
-            if role != "user":
-                if self.messages[-1]["role"] == "system": role = "user"
-                elif self.messages[-1]["role"] == "user": role = "assistant"
-                elif self.messages[-1]["role"] == "assistant": role = "user"
-
-            message = {
-                "role": role,
-                "content": [{"type": "text", "text": text_content}],
-            }
-
-            if image_content:
-                # Check if image_content is a list or a single image
-                if isinstance(image_content, list):
-                    # If image_content is a list of images, loop through each image
-                    for image in image_content:
-                        base64_image = self.encode_image(image)
-                        message["content"].append(
-                            {
-                                "type": "image_url",
-                                "image_url": {
-                                    "url": f"data:image/png;base64,{base64_image}",
-                                },
-                            }
-                        )
-                else:
-                    # If image_content is a single image, handle it directly
-                    base64_image = self.encode_image(image_content)
-                    message["content"].append(
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": f"data:image/png;base64,{base64_image}",
-                            },
-                        }
-                    )
-
-            self.messages.append(message)
-
     def get_response(
         self,
         user_message=None,
diff --git a/agent_s/MultimodalEngine.py b/agent_s/MultimodalEngine.py
index 9a43ce3..994e184 100644
--- a/agent_s/MultimodalEngine.py
+++ b/agent_s/MultimodalEngine.py
@@ -122,33 +122,6 @@ def generate(self, messages, temperature=0., max_new_tokens=None, **kwargs):
             **kwargs,
         ).content[0].text
 
-
-class LMMEngineQwen(LMMEngine):
-    def __init__(self, base_url=None, api_key=None, model=None, rate_limit=-1, **kwargs):
-        self.model = model
-        self.api_key = api_key
-
-        self.base_url = base_url or os.getenv("QWEN_ENDPOINT_URL")
-        if self.base_url is None:
-            raise ValueError("An endpoint URL needs to be provided in either the endpoint_url parameter or as an environment variable named vLLM_ENDPOINT_URL")
-
-
-    def generate(self, messages, temperature=0., max_new_tokens=None, **kwargs):
-        '''Generate the next message based on previous messages'''
-
-        data = {
-            'messages': messages,
-        }
-
-        response = requests.post(self.base_url, json=data)
-        # Check the response
-        if response.status_code == 200:
-            return response.json()['response'][0]
-        else:
-            print(f"Qwen LLM generation failed with status code: {response.status_code}")
-            print("Error message:", response.text)
-
-
 class OpenAIEmbeddingEngine(LMMEngine):
     def __init__(
         self,
@@ -255,163 +228,3 @@ def generate(self, messages, temperature=0., top_p=0.8, repetition_penalty=1.05,
             extra_body={"repetition_penalty": repetition_penalty},
         )
         return completion.choices[0].message.content
-
-
-class LMMEngineLlava(LMMEngine):
-    def __init__(self, model_path=None, model = None, tokenizer=None, image_processor=None, context_len=None, max_new_tokens=None, rate_limit=-1, **kwargs):
-
-        assert model_path is not None, "model path must be provided"
-        self.model_path = model_path
-
-        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit
-
-
-        self.args = type('Args', (), {
-            "model_path": model_path,
-            "model_base": None,
-            "model_name": get_model_name_from_path(model_path),
-            "query": None,
-            "conv_mode": None,
-            "image_file": None,
-            "sep": ",",
-            "temperature": 0.,
-            "top_p": 1,
-            "num_beams": 1,
-            "max_new_tokens": max_new_tokens if max_new_tokens else 2048
-        })()
-
-        if not model:
-            self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model(
-                model_path, None, self.args.model_name)
-        else:
-            self.tokenizer = tokenizer
-            self.model = model
-            self.image_processor = image_processor
-            self.context_len = context_len
-
-        # Check model base type for conversation template
-        if "llama-2" in self.args.model_name.lower():
-            self.args.conv_mode = "llava_llama_2"
-        elif "v1" in self.args.model_name.lower():
-            self.args.conv_mode = "llava_v1"
-        elif "mpt" in self.args.model_name.lower():
-            self.conv_mode = "mpt"
-        else:
-            self.args.conv_mode = "llava_v0"
-
-        self.conversation = conv_templates[self.args.conv_mode].copy()
-
-
-    def generate(self, messages, image=None, temperature=0., max_new_tokens=None, **kwargs):
-
-        # Refresh the conversation holder everytime
-        self.conversation = conv_templates[self.args.conv_mode].copy()
-        '''Generate the next message based on previous messages'''
-        for idx, message in enumerate(messages):
-            self.conversation.append_message(self.conversation.roles[idx % 2], message['content'])
-
-        # Add the "ASSISTANT:" starter before generation
-
-        self.conversation.append_message(self.conversation.roles[1], None)
-        prompt = self.conversation.get_prompt()
-        self.args.image_files = [self.args.image_file]
-        image_files = image_parser(self.args)
-        images = load_images(image_files)
-        image_sizes = [x.size for x in images]
-        images_tensor = process_images(
-            images,
-            self.image_processor,
-            self.model.config
-        ).to(self.model.device, dtype=torch.float16)
-
-
-        input_ids = (
-            tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
-            .unsqueeze(0)
-            .cuda()
-        )
-
-        with torch.inference_mode():
-            output_ids = self.model.generate(
-                input_ids,
-                images=images_tensor,
-                image_sizes=image_sizes,
-                do_sample=True if self.args.temperature > 0 else False,
-                temperature=self.args.temperature,
-                top_p=self.args.top_p,
-                num_beams=self.args.num_beams,
-                max_new_tokens=self.args.max_new_tokens,
-            )
-
-        outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
-        return outputs
-
-class LMMEngineCogVLM(LMMEngine):
-    def __init__(self, model_path=None, model = None, tokenizer=None, image_processor=None, context_len=None, max_new_tokens=None, device=None, rate_limit=-1, **kwargs):
-        assert model_path is not None, "model path must be provided"
-        self.model_path = model_path
-
-        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit
-        if device:
-            self.device = device
-        else: self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
-
-        self.torch_type = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16
-        self.gen_kwargs = {
-            "max_new_tokens": 2048,
-            "pad_token_id": 128002,
-        }
-        if not model:
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                model_path,
-                trust_remote_code=True
-            )
-            self.model = AutoModelForCausalLM.from_pretrained(
-                model_path,
-                torch_dtype=self.torch_type,
-                trust_remote_code=True
-            ).eval().to(self.device)
-        else:
-            self.tokenizer = tokenizer
-            self.model = model
-
-        self.history = None
-
-    def generate(self, messages, image=None, temperature=0., max_new_tokens=None, **kwargs):
-        '''Generate the next message based on previous messages'''
-        if image:
-            image = Image.open(image).convert('RGB')
-        history = []
-        if len(messages) > 1:
-            history_list = [m["content"]["text"] for m in messages[:-1]]
-            # Group two messages at a time add them as a tuple to history
-            history = list(zip(history_list[0::2], history_list[1::2]))
-
-        if image is None:
-            input_by_model = self.model.build_conversation_input_ids(
-                self.tokenizer,
-                query=messages[-1]["content"]["text"],
-                history=history,
-                template_version='chat'
-            )
-        else:
-            input_by_model = self.model.build_conversation_input_ids(
-                self.tokenizer,
-                query=messages[-1]["content"]["text"],
-                history=history,
-                images=[image],
-                template_version='chat'
-            )
-        inputs = {
-            'input_ids': input_by_model['input_ids'].unsqueeze(0).to(self.device),
-            'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(self.device),
-            'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(self.device),
-            'images': [[input_by_model['images'][0].to(self.device).to(self.torch_type)]] if image is not None else None,
-        }
-
-        with torch.no_grad():
-            outputs = self.model.generate(**inputs, **self.gen_kwargs)
-            outputs = outputs[:, inputs['input_ids'].shape[1]:]
-            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-        return response
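For reference, the message format kept by this patch for the API-style engines (LMMEngineOpenAI, LMMEngineAzureOpenAI, LMMEnginevLLM) is a role plus a list of content parts, with screenshots attached as base64 data URLs, as seen in add_message above. A minimal standalone sketch follows; the file path and the local helper are hypothetical and only roughly mirror the agent's encode_image:

```
import base64

def encode_image(image_path):
    # Roughly mirrors the agent's encode_image: read the file bytes and base64-encode them.
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")

base64_image = encode_image("screenshot.png")  # hypothetical path for illustration

message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Describe the next UI action to take."},
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{base64_image}"},
        },
    ],
}
# add_message appends messages shaped like this to self.messages before the engine's generate() is called.
```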