From b7e199ea34fe6c65ae7c4cf2eca9813831f9ef29 Mon Sep 17 00:00:00 2001 From: hsz0403 <64573397+hsz0403@users.noreply.github.com> Date: Sun, 1 Sep 2024 00:02:36 -0700 Subject: [PATCH 1/3] Support external models with vllm huggingface openai gemini and claude (#97) * support external models: vllm huggingface openai * fix prompt for internlm * support gemini claude * fix_bugs * fix_model_bugs * update_requirement * add requirements in README * fixed by suggestions * removed comments --- LeanCopilotTests/ModelAPIs.lean | 23 +++++ python/README.md | 2 +- python/external_models/__init__.py | 5 + python/external_models/claude_runner.py | 59 +++++++++++ python/external_models/external_parser.py | 81 +++++++++++++++ python/external_models/gemini_runner.py | 85 ++++++++++++++++ python/external_models/hf_runner.py | 96 ++++++++++++++++++ python/external_models/oai_runner.py | 87 +++++++++++++++++ python/external_models/vllm_runner.py | 114 ++++++++++++++++++++++ python/server.py | 28 +++++- 10 files changed, 578 insertions(+), 2 deletions(-) create mode 100644 python/external_models/__init__.py create mode 100644 python/external_models/claude_runner.py create mode 100644 python/external_models/external_parser.py create mode 100644 python/external_models/gemini_runner.py create mode 100644 python/external_models/hf_runner.py create mode 100644 python/external_models/oai_runner.py create mode 100644 python/external_models/vllm_runner.py diff --git a/LeanCopilotTests/ModelAPIs.lean b/LeanCopilotTests/ModelAPIs.lean index 652dc38..1af675a 100644 --- a/LeanCopilotTests/ModelAPIs.lean +++ b/LeanCopilotTests/ModelAPIs.lean @@ -99,4 +99,27 @@ def reproverExternalEncoder : ExternalEncoder := { -- Go to ./python and run `uvicorn server:app --port 23337` #eval encode reproverExternalEncoder "n : ℕ\n⊢ gcd n n = n" + +/-- +LLMs apis: openai, claude etc. 
+-/ +def gpt4 : ExternalGenerator := { + name := "gpt4" + host := "localhost" + port := 23337 +} + +#eval generate gpt4 "n : ℕ\n⊢ gcd n n = n" + +/-- +Math LLMs: InternLM, Deepseekmath etc. +-/ +def internLM : ExternalGenerator := { + name := "InternLM" + host := "localhost" + port := 23337 +} + +#eval generate internLM "n : ℕ\n⊢ gcd n n = n" + -/ diff --git a/python/README.md b/python/README.md index c8f5cd4..7933cea 100644 --- a/python/README.md +++ b/python/README.md @@ -7,7 +7,7 @@ Python Server for External Models conda create --name lean-copilot python=3.10 python numpy conda activate lean-copilot pip install torch --index-url https://download.pytorch.org/whl/cu121 # Depending on whether you have CUDA and the CUDA version; see https://pytorch.org/. -pip install fastapi uvicorn loguru transformers openai +pip install fastapi uvicorn loguru transformers openai anthropic google vllm ``` diff --git a/python/external_models/__init__.py b/python/external_models/__init__.py new file mode 100644 index 0000000..193c35a --- /dev/null +++ b/python/external_models/__init__.py @@ -0,0 +1,5 @@ +from .oai_runner import OpenAIRunner +from .hf_runner import HFTacticGenerator +from .vllm_runner import VLLMTacticGenerator +from .claude_runner import ClaudeRunner +from .gemini_runner import GeminiRunner \ No newline at end of file diff --git a/python/external_models/claude_runner.py b/python/external_models/claude_runner.py new file mode 100644 index 0000000..bf01b8e --- /dev/null +++ b/python/external_models/claude_runner.py @@ -0,0 +1,59 @@ +import torch +import numpy as np +from loguru import logger +from typing import List, Tuple +from abc import ABC, abstractmethod +from transformers import ( + AutoModelForCausalLM, + AutoModelForSeq2SeqLM, + AutoTokenizer, + AutoModelForTextEncoding, +) +import os +import numpy as np +try: + from anthropic import Anthropic +except ImportError as e: + pass +from .external_parser import * + + +class ClaudeRunner(Generator, Transformer): + 
client = Anthropic(api_key=os.getenv("ANTHROPIC_KEY")) + + def __init__(self, **args): + self.client_kwargs: dict[str | str] = { + "model": args['model'], + "temperature": args['temperature'], + "max_tokens": args['max_tokens'], + "top_p": args['top_p'], + } + self.name = self.client_kwargs["model"] + + def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]: + prompt = pre_process_input(self.name, input + target_prefix) + + try: + response = self.client.completions.create( + prompt=prompt, + **self.client_kwargs, + ) + content = response.completion + + except Exception as e: + raise e + + results = [(post_process_output(self.name, content),1.0)]# current claude only supports one output + return choices_dedup(results) + + +if __name__ == "__main__": + + generation_kwargs = {"model": "claude-3-opus", + "temperature": 0.9, + "max_tokens": 1024, + "top_p": 0.9, + } + + model = ClaudeRunner(**generation_kwargs) + print(model.generate("n : ℕ\n⊢ gcd n n = n")) diff --git a/python/external_models/external_parser.py b/python/external_models/external_parser.py new file mode 100644 index 0000000..11235d8 --- /dev/null +++ b/python/external_models/external_parser.py @@ -0,0 +1,81 @@ +import os +import torch +import argparse +import numpy as np +from loguru import logger +from typing import List, Tuple +from abc import ABC, abstractmethod +from transformers import ( + AutoModelForCausalLM, + AutoModelForSeq2SeqLM, + AutoTokenizer, + AutoModelForTextEncoding, +) + + +def get_cuda_if_available(): + return torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def pre_process_input(model_name, input): + if model_name == "internlm/internlm2-math-plus-1_8b": + prompt="My LEAN 4 state is:\n```lean\n" + input + \ + "```\nPlease predict a possible tactic to help me prove the theorem." 
+ prompt = f"""<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n""" + elif model_name == "gpt-3.5-turbo" or model_name == "gpt-4-turbo-preview": + prompt = 'Here is a theorom you need to prove in Lean:\n' + \ + input+'\nNow you should suggest one line tactic in lean code:' + elif 'gemini' in model_name or "claude" in model_name: + prompt = 'Here is a theorom you need to prove in Lean:\n' + \ + input+'\nNow you should suggest one line tactic in lean code:' + else: + raise NotImplementedError(f"External model '{model_name}' not supported") + return prompt + + +def post_process_output(model_name, output): + if model_name == "internlm/internlm2-math-plus-1_8b": + result = output.split( + 'assistant')[-1].split('lean')[-1].split('```')[0].split('\n')[1] + elif model_name == "gpt-3.5-turbo" or model_name == "gpt-4-turbo-preview": + result = output.split('lean')[-1].split('```')[0].split('\n')[1] + elif 'gemini' in model_name or "claude" in model_name: + result = output.split('lean')[-1].split('```')[0].split('\n')[1] + else: + raise NotImplementedError(f"External model '{model_name}' not supported") + return result + + +def choices_dedup(output_list: List[tuple[str, float]]) -> List[tuple[str, float]]: + unique_data = {} + for item in output_list: + if item[0] not in unique_data or item[1] > unique_data[item[0]]: + unique_data[item[0]] = item[1] + sorted_data = sorted(unique_data.items(), key=lambda x: x[1], reverse=True) + return sorted_data + + +class Generator(ABC): + @abstractmethod + def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]: + pass + + +class Encoder(ABC): + @abstractmethod + def encode(self, input: str) -> np.ndarray: + pass + + +class Transformer: + def cuda(self) -> None: + self.model.cuda() + + def cpu(self) -> None: + self.model.cpu() + + @property + def device(self) -> torch.device: + return self.model.device + + diff --git a/python/external_models/gemini_runner.py b/python/external_models/gemini_runner.py 
new file mode 100644 index 0000000..f8f555b --- /dev/null +++ b/python/external_models/gemini_runner.py @@ -0,0 +1,85 @@ +import torch +import numpy as np +from loguru import logger +from typing import List, Tuple +from abc import ABC, abstractmethod +from transformers import ( + AutoModelForCausalLM, + AutoModelForSeq2SeqLM, + AutoTokenizer, + AutoModelForTextEncoding, +) +import os +import numpy as np +from .external_parser import * + +try: + import google.generativeai as genai + from google.generativeai import GenerationConfig +except ImportError as e: + pass + + +class GeminiRunner(Generator, Transformer): + client = genai.configure(api_key=os.getenv("GOOGLE_API_KEY")) + safety_settings = [ + { + "category": "HARM_CATEGORY_HARASSMENT", + "threshold": "BLOCK_NONE", + }, + { + "category": "HARM_CATEGORY_HATE_SPEECH", + "threshold": "BLOCK_NONE", + }, + { + "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", + "threshold": "BLOCK_NONE", + }, + { + "category": "HARM_CATEGORY_DANGEROUS_CONTENT", + "threshold": "BLOCK_NONE", + },] + def __init__(self, **args): + + self.client_kwargs: dict[str | str] = { + "model": args['model'], + "temperature": args['temperature'], + "max_tokens": args['max_tokens'], + "top_p": args['top_p'], + + } + self.name = self.client_kwargs["model"] + + self.client = genai.GenerativeModel(args['model']) + self.generation_config = GenerationConfig( + candidate_count=1, + max_output_tokens=args['max_tokens'], + temperature=args['temperature'], + top_p=args['top_p'], + ) + def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]: + prompt = pre_process_input(self.name, input + target_prefix) + + + response = self.client.generate_content( + prompt, + generation_config=self.generation_config, + safety_settings=GeminiRunner.safety_settings, + ) + + + + results = [(post_process_output(self.name, response.text),1.0)]# current gemini only supports one output + return choices_dedup(results) + + +if __name__ == "__main__": + + 
generation_kwargs = {"model": 'gemini-1.0-pro', + "temperature": 0.9, + "max_tokens": 1024, + "top_p": 0.9, + } + + model = GeminiRunner(**generation_kwargs) + print(model.generate("n : ℕ\n⊢ gcd n n = n")) diff --git a/python/external_models/hf_runner.py b/python/external_models/hf_runner.py new file mode 100644 index 0000000..94dadcc --- /dev/null +++ b/python/external_models/hf_runner.py @@ -0,0 +1,96 @@ +import torch +import numpy as np +from loguru import logger +from typing import List, Tuple +from abc import ABC, abstractmethod +from transformers import ( + AutoModelForCausalLM, + AutoModelForSeq2SeqLM, + AutoTokenizer, + AutoModelForTextEncoding, +) +import os +import numpy as np + +import openai +from openai import OpenAI +from .external_parser import * + + +class HFTacticGenerator(Generator, Transformer): + def __init__( + self, + **args + ) -> None: + self.name = args['model'] + self.tokenizer = AutoTokenizer.from_pretrained( + self.name, trust_remote_code=True) + device = args['device'] + if device == "auto": + device = get_cuda_if_available() + else: + device = torch.device(device) + logger.info(f"Loading {self.name} on {device}") + self.model = AutoModelForCausalLM.from_pretrained( + self.name, trust_remote_code=True).to(device) + + self.generation_args: dict[str | str] = { + "do_sample": args["do_sample"], + "temperature": args['temperature'], # chat default is 0.8 + "max_new_tokens": args['max_new_tokens'], + "top_p": args['top_p'], # chat default is 0.8 + # "length_penalty": args["length_penalty"], + "num_return_sequences": args['num_return_sequences'], + # "num_beams": self.num_return_sequences, + # Here if we use beam search for llms the output are not diverse(few tactics are provided). 
+ "output_scores": args["output_scores"], + "output_logits": args["output_logits"], + "return_dict_in_generate": args["return_dict_in_generate"], + } + + def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]: + prompt = input + target_prefix + '''prompt= 'Here is a theorom you need to prove in Lean:\n'+prompt+'\nNow you should suggest one line tactic in lean code:' + prompt = f"""<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n""" + ''' + prompt = pre_process_input(self.name, prompt) + + self.model = self.model.eval() + + tokenized_input = self.tokenizer(prompt, return_tensors="pt") + eos_token_id = [self.tokenizer.eos_token_id, + self.tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]] + outputs = self.model.generate( + tokenized_input.input_ids.to(self.device), + eos_token_id=eos_token_id, + **self.generation_args + ) + response = self.tokenizer.batch_decode( + outputs['sequences'], skip_special_tokens=True) + result = [] + index = 0 + for out, score in zip(response, outputs.scores): + out = post_process_output(self.name, out) + result.append((out, score[index].exp().sum().log().cpu().item())) + index += 1 + result = choices_dedup(result) + return result + + +if __name__ == "__main__": + + generation_kwargs = {"model": "internlm/internlm2-math-plus-1_8b", + "temperature": 0.6, + "max_new_tokens": 1024, + "top_p": 0.9, + "length_penalty": 0, + "num_return_sequences": 64, + "do_sample": True, + "output_scores": True, + "output_logits": False, + "return_dict_in_generate": True, + "device": "auto", + } + model = HFTacticGenerator(**generation_kwargs) + model.cuda() + print(model.generate("n : ℕ\n⊢ gcd n n = n")) diff --git a/python/external_models/oai_runner.py b/python/external_models/oai_runner.py new file mode 100644 index 0000000..b2231c6 --- /dev/null +++ b/python/external_models/oai_runner.py @@ -0,0 +1,87 @@ +import torch +import numpy as np +from loguru import logger +from typing import List, Tuple +from abc import 
ABC, abstractmethod +from transformers import ( + AutoModelForCausalLM, + AutoModelForSeq2SeqLM, + AutoTokenizer, + AutoModelForTextEncoding, +) +import os +import numpy as np +import openai +from openai import OpenAI +from .external_parser import * + + +class OpenAIRunner(Generator, Transformer): + client = OpenAI( + api_key=os.getenv("OPENAI_API_KEY"), + ) + + def __init__(self, **args): + self.client_kwargs: dict[str | str] = { + "model": args['model'], + "temperature": args['temperature'], + "max_tokens": args['max_tokens'], + "top_p": args['top_p'], + "frequency_penalty": 0, + "presence_penalty": 0, + "n": args['num_return_sequences'], + "timeout": args['openai_timeout'], + # "stop": args.stop, --> stop is only used for base models currently + } + self.name = self.client_kwargs["model"] + + def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]: + prompt = pre_process_input(self.name, input + target_prefix) + prompt = [ + {"role": "user", "content": f"{prompt}"}, + + ] + try: + response = OpenAIRunner.client.chat.completions.create( + messages=prompt, + logprobs=True, + **self.client_kwargs, + ) + except ( + openai.APIError, + openai.RateLimitError, + openai.InternalServerError, + openai.OpenAIError, + openai.APIStatusError, + openai.APITimeoutError, + openai.InternalServerError, + openai.APIConnectionError, + ) as e: + print("Exception: ", repr(e)) + print("Consider reducing the number of parallel processes.") + return OpenAIRunner.generate(self, input, target_prefix) + except Exception as e: + print(f"Failed to run the model for {prompt}!") + print("Exception: ", repr(e)) + raise e + + results = [(post_process_output(self.name, c.message.content), np.exp(-np.mean([ + token.logprob for token in c.logprobs.content])))for c in response.choices] + return choices_dedup(results) + + +if __name__ == "__main__": + + generation_kwargs = {"model": "gpt-4-turbo-preview", + "temperature": 0.9, + "max_tokens": 1024, + "top_p": 0.9, + 
"frequency_penalty": 0, + "presence_penalty": 0, + "num_return_sequences": 16, + "openai_timeout": 45, + # "stop": args.stop, --> stop is only used for base models currently + } + + model = OpenAIRunner(**generation_kwargs) + print(model.generate("n : ℕ\n⊢ gcd n n = n")) diff --git a/python/external_models/vllm_runner.py b/python/external_models/vllm_runner.py new file mode 100644 index 0000000..6ac5206 --- /dev/null +++ b/python/external_models/vllm_runner.py @@ -0,0 +1,114 @@ +import torch +import numpy as np +from loguru import logger +from typing import List, Tuple +from abc import ABC, abstractmethod +from transformers import ( + AutoModelForCausalLM, + AutoModelForSeq2SeqLM, + AutoTokenizer, + AutoModelForTextEncoding, +) +import os +import numpy as np + +try: + from vllm import LLM, SamplingParams +except ImportError as e: + print("Cannot import vllm") + pass + +from .external_parser import * + + +class VLLMTacticGenerator(Generator, Transformer): + def __init__( + self, + **args + ) -> None: + + self.name = args['model'] + self.llm = LLM( + model=self.name, + tokenizer=self.name, + tensor_parallel_size=args["tensor_parallel_size"], + # dtype=args.dtype, + enforce_eager=True, + max_model_len=4096, + disable_custom_all_reduce=False, + # enable_prefix_caching=args.enable_prefix_caching, + trust_remote_code=True, + ) + self.sampling_params = SamplingParams( + n=args['n'], + max_tokens=args['max_tokens'], + temperature=args['temperature'], + top_p=args['top_p'], + frequency_penalty=0, + presence_penalty=0, + ) + + self.tokenizer = AutoTokenizer.from_pretrained( + self.name, trust_remote_code=True) + device = args['device'] + if device == "auto": + device = get_cuda_if_available() + else: + device = torch.device(device) + logger.info(f"Loading {self.name} on {device}") + # self.model = AutoModelForCausalLM.from_pretrained(self.name, trust_remote_code=True).to(device) + + '''self.generation_args: dict[str | str] = { + "do_sample":args["do_sample"], + 
"temperature": args['temperature'],#chat default is 0.8 + "max_new_tokens": args['max_new_tokens'], + "top_p": args['top_p'],#chat default is 0.8 + #"length_penalty": args["length_penalty"], + "num_return_sequences": args['num_return_sequences'], + #"num_beams": self.num_return_sequences, + ##Here if we use beam search for llms the output are not diverse(few tactics are provided). + "output_scores":args["output_scores"], + "output_logits":args["output_logits"], + "return_dict_in_generate":args["return_dict_in_generate"], + }''' + + def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]: + prompt = input + target_prefix + '''prompt= 'Here is a theorom you need to prove in Lean:\n'+prompt+'\nNow you should suggest one line tactic in lean code:' + prompt = f"""<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n""" + ''' + prompt = pre_process_input(self.name, prompt) + + # self.model = self.model.eval() + + vllm_outputs = self.llm.generate(prompt, self.sampling_params) + # pdb.set_trace() + # print(vllm_outputs) + # exit() + result = [] + for output in vllm_outputs[0].outputs: # bsz=1 for now + out = output.text.split('<|im_end|>')[0] + result.append((post_process_output(self.name, out), + np.exp(output.cumulative_logprob))) + + result = choices_dedup(result) + return result + + +if __name__ == "__main__": + + generation_kwargs = {"model": "internlm/internlm2-math-plus-1_8b", + "tensor_parallel_size": 2, + "temperature": 0.6, + "max_tokens": 1024, + "top_p": 0.9, + "length_penalty": 0, + "n": 32, + "do_sample": True, + "output_scores": True, + "output_logits": False, + "return_dict_in_generate": True, + "device": "auto", + } + model = VLLMTacticGenerator(**generation_kwargs) + print(model.generate("n : ℕ\n⊢ gcd n n = n")) diff --git a/python/server.py b/python/server.py index 81a4b14..84c9733 100644 --- a/python/server.py +++ b/python/server.py @@ -3,13 +3,39 @@ from pydantic import BaseModel from models import * +from 
external_models import * app = FastAPI() models = { # "EleutherAI/llemma_7b": DecoderOnlyTransformer( # "EleutherAI/llemma_7b", num_return_sequences=2, max_length=64, device="auto" - #), + # ), + + "gpt4": OpenAIRunner( + model="gpt-4-turbo-preview", + temperature=0.9, + max_tokens=1024, + top_p=0.9, + frequency_penalty=0, + presence_penalty=0, + num_return_sequences=16, + openai_timeout=45, + ), + "InternLM": VLLMTacticGenerator( + model="internlm/internlm2-math-plus-1_8b", + tensor_parallel_size=2, + temperature=0.6, + max_tokens=1024, + top_p=0.9, + length_penalty=0, + n=32, + do_sample=True, + output_scores=True, + output_logits=False, + return_dict_in_generate=True, + device="auto", + ), "wellecks/llmstep-mathlib4-pythia2.8b": PythiaTacticGenerator( num_return_sequences=32, max_length=1024, device="auto" ), From 157e1588102d7ecccea449bb93a4ca1701935f03 Mon Sep 17 00:00:00 2001 From: psong Date: Sun, 1 Sep 2024 00:39:35 -0700 Subject: [PATCH 2/3] chores: large scale cleanup for the external model support --- LeanCopilotTests/ModelAPIs.lean | 4 +-- python/README.md | 11 +++++-- python/external_models/claude_runner.py | 30 +++++-------------- python/external_models/external_parser.py | 11 ------- python/external_models/gemini_runner.py | 26 ++++------------ python/external_models/hf_runner.py | 19 +++--------- python/external_models/oai_runner.py | 14 ++------- python/external_models/vllm_runner.py | 36 +---------------------- 8 files changed, 29 insertions(+), 122 deletions(-) diff --git a/LeanCopilotTests/ModelAPIs.lean b/LeanCopilotTests/ModelAPIs.lean index 1af675a..ca01459 100644 --- a/LeanCopilotTests/ModelAPIs.lean +++ b/LeanCopilotTests/ModelAPIs.lean @@ -101,7 +101,7 @@ def reproverExternalEncoder : ExternalEncoder := { #eval encode reproverExternalEncoder "n : ℕ\n⊢ gcd n n = n" /-- -LLMs apis: openai, claude etc. +General-purpose LLM apis: openai, claude, etc. 
-/ def gpt4 : ExternalGenerator := { name := "gpt4" @@ -112,7 +112,7 @@ def gpt4 : ExternalGenerator := { #eval generate gpt4 "n : ℕ\n⊢ gcd n n = n" /-- -Math LLMs: InternLM, Deepseekmath etc. +Math LLMs: InternLM, Deepseekmath, etc. -/ def internLM : ExternalGenerator := { name := "InternLM" diff --git a/python/README.md b/python/README.md index 7933cea..a04d446 100644 --- a/python/README.md +++ b/python/README.md @@ -1,18 +1,23 @@ Python Server for External Models ================================= +This folder contains code that enables running some of the leading general-purpose or math-specific models. It is also fairly easy to adapt the existing code and run other external models you would like to bring. + ## Requirements +The setup steps are pretty simple. The script below is sufficient to run all external models already supported in this folder. If you only want to run a subset of them, you may not need all packages in the last step of pip installation. + ```bash conda create --name lean-copilot python=3.10 python numpy conda activate lean-copilot -pip install torch --index-url https://download.pytorch.org/whl/cu121 # Depending on whether you have CUDA and the CUDA version; see https://pytorch.org/. -pip install fastapi uvicorn loguru transformers openai anthropic google vllm +pip install torch --index-url https://download.pytorch.org/whl/cu121 # Depending on whether you have CUDA and, if so, your CUDA version; see https://pytorch.org/. +pip install fastapi uvicorn loguru transformers openai anthropic google.generativeai vllm ``` - ## Running the Server ```bash uvicorn server:app --port 23337 ``` + +After the server is up running, you can go to `LeanCopilotTests/ModelAPIs.lean` to try your external models out! 
diff --git a/python/external_models/claude_runner.py b/python/external_models/claude_runner.py index bf01b8e..d887e37 100644 --- a/python/external_models/claude_runner.py +++ b/python/external_models/claude_runner.py @@ -1,16 +1,5 @@ -import torch -import numpy as np -from loguru import logger from typing import List, Tuple -from abc import ABC, abstractmethod -from transformers import ( - AutoModelForCausalLM, - AutoModelForSeq2SeqLM, - AutoTokenizer, - AutoModelForTextEncoding, -) import os -import numpy as np try: from anthropic import Anthropic except ImportError as e: @@ -33,22 +22,17 @@ def __init__(self, **args): def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]: prompt = pre_process_input(self.name, input + target_prefix) - try: - response = self.client.completions.create( - prompt=prompt, - **self.client_kwargs, - ) - content = response.completion - - except Exception as e: - raise e - - results = [(post_process_output(self.name, content),1.0)]# current claude only supports one output + response = self.client.completions.create( + prompt=prompt, + **self.client_kwargs, + ) + content = response.completion + + results = [(post_process_output(self.name, content),1.0)] # Currently Claude only supports one output. 
return choices_dedup(results) if __name__ == "__main__": - generation_kwargs = {"model": "claude-3-opus", "temperature": 0.9, "max_tokens": 1024, diff --git a/python/external_models/external_parser.py b/python/external_models/external_parser.py index 11235d8..7edf0bf 100644 --- a/python/external_models/external_parser.py +++ b/python/external_models/external_parser.py @@ -1,16 +1,7 @@ -import os import torch -import argparse import numpy as np -from loguru import logger from typing import List, Tuple from abc import ABC, abstractmethod -from transformers import ( - AutoModelForCausalLM, - AutoModelForSeq2SeqLM, - AutoTokenizer, - AutoModelForTextEncoding, -) def get_cuda_if_available(): @@ -77,5 +68,3 @@ def cpu(self) -> None: @property def device(self) -> torch.device: return self.model.device - - diff --git a/python/external_models/gemini_runner.py b/python/external_models/gemini_runner.py index f8f555b..0da60fc 100644 --- a/python/external_models/gemini_runner.py +++ b/python/external_models/gemini_runner.py @@ -1,23 +1,11 @@ -import torch -import numpy as np -from loguru import logger from typing import List, Tuple -from abc import ABC, abstractmethod -from transformers import ( - AutoModelForCausalLM, - AutoModelForSeq2SeqLM, - AutoTokenizer, - AutoModelForTextEncoding, -) import os -import numpy as np -from .external_parser import * - try: import google.generativeai as genai from google.generativeai import GenerationConfig except ImportError as e: pass +from .external_parser import * class GeminiRunner(Generator, Transformer): @@ -39,8 +27,8 @@ class GeminiRunner(Generator, Transformer): "category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE", },] - def __init__(self, **args): - + + def __init__(self, **args): self.client_kwargs: dict[str | str] = { "model": args['model'], "temperature": args['temperature'], @@ -49,7 +37,6 @@ def __init__(self, **args): } self.name = self.client_kwargs["model"] - self.client = 
genai.GenerativeModel(args['model']) self.generation_config = GenerationConfig( candidate_count=1, @@ -57,24 +44,21 @@ def __init__(self, **args): temperature=args['temperature'], top_p=args['top_p'], ) + def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]: prompt = pre_process_input(self.name, input + target_prefix) - response = self.client.generate_content( prompt, generation_config=self.generation_config, safety_settings=GeminiRunner.safety_settings, ) - - - results = [(post_process_output(self.name, response.text),1.0)]# current gemini only supports one output + results = [(post_process_output(self.name, response.text),1.0)] # Currently Gemini only supports one output. return choices_dedup(results) if __name__ == "__main__": - generation_kwargs = {"model": 'gemini-1.0-pro', "temperature": 0.9, "max_tokens": 1024, diff --git a/python/external_models/hf_runner.py b/python/external_models/hf_runner.py index 94dadcc..fc49c06 100644 --- a/python/external_models/hf_runner.py +++ b/python/external_models/hf_runner.py @@ -1,19 +1,10 @@ import torch -import numpy as np from loguru import logger from typing import List, Tuple -from abc import ABC, abstractmethod from transformers import ( AutoModelForCausalLM, - AutoModelForSeq2SeqLM, AutoTokenizer, - AutoModelForTextEncoding, ) -import os -import numpy as np - -import openai -from openai import OpenAI from .external_parser import * @@ -36,13 +27,11 @@ def __init__( self.generation_args: dict[str | str] = { "do_sample": args["do_sample"], - "temperature": args['temperature'], # chat default is 0.8 + "temperature": args['temperature'], # chat default is 0.8. "max_new_tokens": args['max_new_tokens'], - "top_p": args['top_p'], # chat default is 0.8 - # "length_penalty": args["length_penalty"], + "top_p": args['top_p'], # chat default is 0.8. 
"num_return_sequences": args['num_return_sequences'], - # "num_beams": self.num_return_sequences, - # Here if we use beam search for llms the output are not diverse(few tactics are provided). + # "num_beams": self.num_return_sequences, # Here if we use beam search for llms the output are not diverse (few tactics are provided). "output_scores": args["output_scores"], "output_logits": args["output_logits"], "return_dict_in_generate": args["return_dict_in_generate"], @@ -67,6 +56,7 @@ def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float ) response = self.tokenizer.batch_decode( outputs['sequences'], skip_special_tokens=True) + result = [] index = 0 for out, score in zip(response, outputs.scores): @@ -78,7 +68,6 @@ def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float if __name__ == "__main__": - generation_kwargs = {"model": "internlm/internlm2-math-plus-1_8b", "temperature": 0.6, "max_new_tokens": 1024, diff --git a/python/external_models/oai_runner.py b/python/external_models/oai_runner.py index b2231c6..cdc21cb 100644 --- a/python/external_models/oai_runner.py +++ b/python/external_models/oai_runner.py @@ -1,14 +1,5 @@ -import torch import numpy as np -from loguru import logger from typing import List, Tuple -from abc import ABC, abstractmethod -from transformers import ( - AutoModelForCausalLM, - AutoModelForSeq2SeqLM, - AutoTokenizer, - AutoModelForTextEncoding, -) import os import numpy as np import openai @@ -31,7 +22,7 @@ def __init__(self, **args): "presence_penalty": 0, "n": args['num_return_sequences'], "timeout": args['openai_timeout'], - # "stop": args.stop, --> stop is only used for base models currently + # "stop": args.stop, # stop is only used for base models currently } self.name = self.client_kwargs["model"] @@ -71,7 +62,6 @@ def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float if __name__ == "__main__": - generation_kwargs = {"model": "gpt-4-turbo-preview", 
"temperature": 0.9, "max_tokens": 1024, @@ -80,7 +70,7 @@ def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float "presence_penalty": 0, "num_return_sequences": 16, "openai_timeout": 45, - # "stop": args.stop, --> stop is only used for base models currently + # "stop": args.stop, # stop is only used for base models currently } model = OpenAIRunner(**generation_kwargs) diff --git a/python/external_models/vllm_runner.py b/python/external_models/vllm_runner.py index 6ac5206..8e1233c 100644 --- a/python/external_models/vllm_runner.py +++ b/python/external_models/vllm_runner.py @@ -2,22 +2,12 @@ import numpy as np from loguru import logger from typing import List, Tuple -from abc import ABC, abstractmethod -from transformers import ( - AutoModelForCausalLM, - AutoModelForSeq2SeqLM, - AutoTokenizer, - AutoModelForTextEncoding, -) -import os -import numpy as np - +from transformers import AutoTokenizer try: from vllm import LLM, SamplingParams except ImportError as e: print("Cannot import vllm") pass - from .external_parser import * @@ -26,17 +16,14 @@ def __init__( self, **args ) -> None: - self.name = args['model'] self.llm = LLM( model=self.name, tokenizer=self.name, tensor_parallel_size=args["tensor_parallel_size"], - # dtype=args.dtype, enforce_eager=True, max_model_len=4096, disable_custom_all_reduce=False, - # enable_prefix_caching=args.enable_prefix_caching, trust_remote_code=True, ) self.sampling_params = SamplingParams( @@ -56,21 +43,6 @@ def __init__( else: device = torch.device(device) logger.info(f"Loading {self.name} on {device}") - # self.model = AutoModelForCausalLM.from_pretrained(self.name, trust_remote_code=True).to(device) - - '''self.generation_args: dict[str | str] = { - "do_sample":args["do_sample"], - "temperature": args['temperature'],#chat default is 0.8 - "max_new_tokens": args['max_new_tokens'], - "top_p": args['top_p'],#chat default is 0.8 - #"length_penalty": args["length_penalty"], - "num_return_sequences": 
args['num_return_sequences'], - #"num_beams": self.num_return_sequences, - ##Here if we use beam search for llms the output are not diverse(few tactics are provided). - "output_scores":args["output_scores"], - "output_logits":args["output_logits"], - "return_dict_in_generate":args["return_dict_in_generate"], - }''' def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]: prompt = input + target_prefix @@ -79,12 +51,7 @@ def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float ''' prompt = pre_process_input(self.name, prompt) - # self.model = self.model.eval() - vllm_outputs = self.llm.generate(prompt, self.sampling_params) - # pdb.set_trace() - # print(vllm_outputs) - # exit() result = [] for output in vllm_outputs[0].outputs: # bsz=1 for now out = output.text.split('<|im_end|>')[0] @@ -96,7 +63,6 @@ def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float if __name__ == "__main__": - generation_kwargs = {"model": "internlm/internlm2-math-plus-1_8b", "tensor_parallel_size": 2, "temperature": 0.6, From 30a70c01bbc79b0c5d7e13bfb0e9998807a5563e Mon Sep 17 00:00:00 2001 From: psong Date: Sun, 1 Sep 2024 00:45:59 -0700 Subject: [PATCH 3/3] Format python code with black and add contribution instructions to readme --- python/README.md | 6 +++ python/external_models/__init__.py | 2 +- python/external_models/claude_runner.py | 34 +++++++------ python/external_models/external_parser.py | 37 +++++++++----- python/external_models/gemini_runner.py | 52 ++++++++++--------- python/external_models/hf_runner.py | 61 ++++++++++++----------- python/external_models/oai_runner.py | 43 +++++++++------- python/external_models/vllm_runner.py | 55 ++++++++++---------- python/server.py | 1 - 9 files changed, 163 insertions(+), 128 deletions(-) diff --git a/python/README.md b/python/README.md index a04d446..1555b3d 100644 --- a/python/README.md +++ b/python/README.md @@ -21,3 +21,9 @@ uvicorn server:app --port 23337 
``` After the server is up running, you can go to `LeanCopilotTests/ModelAPIs.lean` to try your external models out! + +## Contributions + +We welcome contributions. If you think it would be beneficial to add some other external models, or if you would like to make other contributions regarding the external model support in Lean Copilot, please feel free to open a PR. The main entry point is this `python` folder as well as the `ModelAPIs.lean` file under `LeanCopilotTests`. + +We use [`black`](https://pypi.org/project/black/) to format code in this folder. diff --git a/python/external_models/__init__.py b/python/external_models/__init__.py index 193c35a..8563182 100644 --- a/python/external_models/__init__.py +++ b/python/external_models/__init__.py @@ -2,4 +2,4 @@ from .hf_runner import HFTacticGenerator from .vllm_runner import VLLMTacticGenerator from .claude_runner import ClaudeRunner -from .gemini_runner import GeminiRunner \ No newline at end of file +from .gemini_runner import GeminiRunner diff --git a/python/external_models/claude_runner.py b/python/external_models/claude_runner.py index d887e37..479df39 100644 --- a/python/external_models/claude_runner.py +++ b/python/external_models/claude_runner.py @@ -1,5 +1,6 @@ from typing import List, Tuple import os + try: from anthropic import Anthropic except ImportError as e: @@ -12,32 +13,35 @@ class ClaudeRunner(Generator, Transformer): def __init__(self, **args): self.client_kwargs: dict[str | str] = { - "model": args['model'], - "temperature": args['temperature'], - "max_tokens": args['max_tokens'], - "top_p": args['top_p'], - } + "model": args["model"], + "temperature": args["temperature"], + "max_tokens": args["max_tokens"], + "top_p": args["top_p"], + } self.name = self.client_kwargs["model"] def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]: prompt = pre_process_input(self.name, input + target_prefix) - + response = self.client.completions.create( - prompt=prompt, - 
**self.client_kwargs, - ) + prompt=prompt, + **self.client_kwargs, + ) content = response.completion - results = [(post_process_output(self.name, content),1.0)] # Currently Claude only supports one output. + results = [ + (post_process_output(self.name, content), 1.0) + ] # Currently Claude only supports one output. return choices_dedup(results) if __name__ == "__main__": - generation_kwargs = {"model": "claude-3-opus", - "temperature": 0.9, - "max_tokens": 1024, - "top_p": 0.9, - } + generation_kwargs = { + "model": "claude-3-opus", + "temperature": 0.9, + "max_tokens": 1024, + "top_p": 0.9, + } model = ClaudeRunner(**generation_kwargs) print(model.generate("n : ℕ\n⊢ gcd n n = n")) diff --git a/python/external_models/external_parser.py b/python/external_models/external_parser.py index 7edf0bf..70e364c 100644 --- a/python/external_models/external_parser.py +++ b/python/external_models/external_parser.py @@ -10,15 +10,24 @@ def get_cuda_if_available(): def pre_process_input(model_name, input): if model_name == "internlm/internlm2-math-plus-1_8b": - prompt="My LEAN 4 state is:\n```lean\n" + input + \ - "```\nPlease predict a possible tactic to help me prove the theorem." + prompt = ( + "My LEAN 4 state is:\n```lean\n" + + input + + "```\nPlease predict a possible tactic to help me prove the theorem." 
+ ) prompt = f"""<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n""" elif model_name == "gpt-3.5-turbo" or model_name == "gpt-4-turbo-preview": - prompt = 'Here is a theorom you need to prove in Lean:\n' + \ - input+'\nNow you should suggest one line tactic in lean code:' - elif 'gemini' in model_name or "claude" in model_name: - prompt = 'Here is a theorom you need to prove in Lean:\n' + \ - input+'\nNow you should suggest one line tactic in lean code:' + prompt = ( + "Here is a theorom you need to prove in Lean:\n" + + input + + "\nNow you should suggest one line tactic in lean code:" + ) + elif "gemini" in model_name or "claude" in model_name: + prompt = ( + "Here is a theorom you need to prove in Lean:\n" + + input + + "\nNow you should suggest one line tactic in lean code:" + ) else: raise NotImplementedError(f"External model '{model_name}' not supported") return prompt @@ -26,12 +35,16 @@ def pre_process_input(model_name, input): def post_process_output(model_name, output): if model_name == "internlm/internlm2-math-plus-1_8b": - result = output.split( - 'assistant')[-1].split('lean')[-1].split('```')[0].split('\n')[1] + result = ( + output.split("assistant")[-1] + .split("lean")[-1] + .split("```")[0] + .split("\n")[1] + ) elif model_name == "gpt-3.5-turbo" or model_name == "gpt-4-turbo-preview": - result = output.split('lean')[-1].split('```')[0].split('\n')[1] - elif 'gemini' in model_name or "claude" in model_name: - result = output.split('lean')[-1].split('```')[0].split('\n')[1] + result = output.split("lean")[-1].split("```")[0].split("\n")[1] + elif "gemini" in model_name or "claude" in model_name: + result = output.split("lean")[-1].split("```")[0].split("\n")[1] else: raise NotImplementedError(f"External model '{model_name}' not supported") return result diff --git a/python/external_models/gemini_runner.py b/python/external_models/gemini_runner.py index 0da60fc..5d65ebb 100644 --- a/python/external_models/gemini_runner.py +++ 
b/python/external_models/gemini_runner.py @@ -1,5 +1,6 @@ from typing import List, Tuple import os + try: import google.generativeai as genai from google.generativeai import GenerationConfig @@ -26,44 +27,47 @@ class GeminiRunner(Generator, Transformer): { "category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE", - },] - - def __init__(self, **args): + }, + ] + + def __init__(self, **args): self.client_kwargs: dict[str | str] = { - "model": args['model'], - "temperature": args['temperature'], - "max_tokens": args['max_tokens'], - "top_p": args['top_p'], - + "model": args["model"], + "temperature": args["temperature"], + "max_tokens": args["max_tokens"], + "top_p": args["top_p"], } self.name = self.client_kwargs["model"] - self.client = genai.GenerativeModel(args['model']) + self.client = genai.GenerativeModel(args["model"]) self.generation_config = GenerationConfig( candidate_count=1, - max_output_tokens=args['max_tokens'], - temperature=args['temperature'], - top_p=args['top_p'], + max_output_tokens=args["max_tokens"], + temperature=args["temperature"], + top_p=args["top_p"], ) - + def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]: prompt = pre_process_input(self.name, input + target_prefix) - + response = self.client.generate_content( - prompt, - generation_config=self.generation_config, - safety_settings=GeminiRunner.safety_settings, - ) + prompt, + generation_config=self.generation_config, + safety_settings=GeminiRunner.safety_settings, + ) - results = [(post_process_output(self.name, response.text),1.0)] # Currently Gemini only supports one output. + results = [ + (post_process_output(self.name, response.text), 1.0) + ] # Currently Gemini only supports one output. 
return choices_dedup(results) if __name__ == "__main__": - generation_kwargs = {"model": 'gemini-1.0-pro', - "temperature": 0.9, - "max_tokens": 1024, - "top_p": 0.9, - } + generation_kwargs = { + "model": "gemini-1.0-pro", + "temperature": 0.9, + "max_tokens": 1024, + "top_p": 0.9, + } model = GeminiRunner(**generation_kwargs) print(model.generate("n : ℕ\n⊢ gcd n n = n")) diff --git a/python/external_models/hf_runner.py b/python/external_models/hf_runner.py index fc49c06..28e2581 100644 --- a/python/external_models/hf_runner.py +++ b/python/external_models/hf_runner.py @@ -9,28 +9,27 @@ class HFTacticGenerator(Generator, Transformer): - def __init__( - self, - **args - ) -> None: - self.name = args['model'] + def __init__(self, **args) -> None: + self.name = args["model"] self.tokenizer = AutoTokenizer.from_pretrained( - self.name, trust_remote_code=True) - device = args['device'] + self.name, trust_remote_code=True + ) + device = args["device"] if device == "auto": device = get_cuda_if_available() else: device = torch.device(device) logger.info(f"Loading {self.name} on {device}") self.model = AutoModelForCausalLM.from_pretrained( - self.name, trust_remote_code=True).to(device) + self.name, trust_remote_code=True + ).to(device) self.generation_args: dict[str | str] = { "do_sample": args["do_sample"], - "temperature": args['temperature'], # chat default is 0.8. - "max_new_tokens": args['max_new_tokens'], - "top_p": args['top_p'], # chat default is 0.8. - "num_return_sequences": args['num_return_sequences'], + "temperature": args["temperature"], # chat default is 0.8. + "max_new_tokens": args["max_new_tokens"], + "top_p": args["top_p"], # chat default is 0.8. + "num_return_sequences": args["num_return_sequences"], # "num_beams": self.num_return_sequences, # Here if we use beam search for llms the output are not diverse (few tactics are provided). 
"output_scores": args["output_scores"], "output_logits": args["output_logits"], @@ -47,16 +46,19 @@ def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float self.model = self.model.eval() tokenized_input = self.tokenizer(prompt, return_tensors="pt") - eos_token_id = [self.tokenizer.eos_token_id, - self.tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]] + eos_token_id = [ + self.tokenizer.eos_token_id, + self.tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0], + ] outputs = self.model.generate( tokenized_input.input_ids.to(self.device), eos_token_id=eos_token_id, - **self.generation_args + **self.generation_args, ) response = self.tokenizer.batch_decode( - outputs['sequences'], skip_special_tokens=True) - + outputs["sequences"], skip_special_tokens=True + ) + result = [] index = 0 for out, score in zip(response, outputs.scores): @@ -68,18 +70,19 @@ def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float if __name__ == "__main__": - generation_kwargs = {"model": "internlm/internlm2-math-plus-1_8b", - "temperature": 0.6, - "max_new_tokens": 1024, - "top_p": 0.9, - "length_penalty": 0, - "num_return_sequences": 64, - "do_sample": True, - "output_scores": True, - "output_logits": False, - "return_dict_in_generate": True, - "device": "auto", - } + generation_kwargs = { + "model": "internlm/internlm2-math-plus-1_8b", + "temperature": 0.6, + "max_new_tokens": 1024, + "top_p": 0.9, + "length_penalty": 0, + "num_return_sequences": 64, + "do_sample": True, + "output_scores": True, + "output_logits": False, + "return_dict_in_generate": True, + "device": "auto", + } model = HFTacticGenerator(**generation_kwargs) model.cuda() print(model.generate("n : ℕ\n⊢ gcd n n = n")) diff --git a/python/external_models/oai_runner.py b/python/external_models/oai_runner.py index cdc21cb..75b6172 100644 --- a/python/external_models/oai_runner.py +++ b/python/external_models/oai_runner.py @@ -14,14 +14,14 @@ class OpenAIRunner(Generator, 
Transformer): def __init__(self, **args): self.client_kwargs: dict[str | str] = { - "model": args['model'], - "temperature": args['temperature'], - "max_tokens": args['max_tokens'], - "top_p": args['top_p'], + "model": args["model"], + "temperature": args["temperature"], + "max_tokens": args["max_tokens"], + "top_p": args["top_p"], "frequency_penalty": 0, "presence_penalty": 0, - "n": args['num_return_sequences'], - "timeout": args['openai_timeout'], + "n": args["num_return_sequences"], + "timeout": args["openai_timeout"], # "stop": args.stop, # stop is only used for base models currently } self.name = self.client_kwargs["model"] @@ -30,7 +30,6 @@ def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float prompt = pre_process_input(self.name, input + target_prefix) prompt = [ {"role": "user", "content": f"{prompt}"}, - ] try: response = OpenAIRunner.client.chat.completions.create( @@ -56,22 +55,28 @@ def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float print("Exception: ", repr(e)) raise e - results = [(post_process_output(self.name, c.message.content), np.exp(-np.mean([ - token.logprob for token in c.logprobs.content])))for c in response.choices] + results = [ + ( + post_process_output(self.name, c.message.content), + np.exp(-np.mean([token.logprob for token in c.logprobs.content])), + ) + for c in response.choices + ] return choices_dedup(results) if __name__ == "__main__": - generation_kwargs = {"model": "gpt-4-turbo-preview", - "temperature": 0.9, - "max_tokens": 1024, - "top_p": 0.9, - "frequency_penalty": 0, - "presence_penalty": 0, - "num_return_sequences": 16, - "openai_timeout": 45, - # "stop": args.stop, # stop is only used for base models currently - } + generation_kwargs = { + "model": "gpt-4-turbo-preview", + "temperature": 0.9, + "max_tokens": 1024, + "top_p": 0.9, + "frequency_penalty": 0, + "presence_penalty": 0, + "num_return_sequences": 16, + "openai_timeout": 45, + # "stop": args.stop, # stop is 
only used for base models currently + } model = OpenAIRunner(**generation_kwargs) print(model.generate("n : ℕ\n⊢ gcd n n = n")) diff --git a/python/external_models/vllm_runner.py b/python/external_models/vllm_runner.py index 8e1233c..85f6d79 100644 --- a/python/external_models/vllm_runner.py +++ b/python/external_models/vllm_runner.py @@ -3,6 +3,7 @@ from loguru import logger from typing import List, Tuple from transformers import AutoTokenizer + try: from vllm import LLM, SamplingParams except ImportError as e: @@ -12,11 +13,8 @@ class VLLMTacticGenerator(Generator, Transformer): - def __init__( - self, - **args - ) -> None: - self.name = args['model'] + def __init__(self, **args) -> None: + self.name = args["model"] self.llm = LLM( model=self.name, tokenizer=self.name, @@ -27,17 +25,18 @@ def __init__( trust_remote_code=True, ) self.sampling_params = SamplingParams( - n=args['n'], - max_tokens=args['max_tokens'], - temperature=args['temperature'], - top_p=args['top_p'], + n=args["n"], + max_tokens=args["max_tokens"], + temperature=args["temperature"], + top_p=args["top_p"], frequency_penalty=0, presence_penalty=0, ) self.tokenizer = AutoTokenizer.from_pretrained( - self.name, trust_remote_code=True) - device = args['device'] + self.name, trust_remote_code=True + ) + device = args["device"] if device == "auto": device = get_cuda_if_available() else: @@ -54,27 +53,29 @@ def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float vllm_outputs = self.llm.generate(prompt, self.sampling_params) result = [] for output in vllm_outputs[0].outputs: # bsz=1 for now - out = output.text.split('<|im_end|>')[0] - result.append((post_process_output(self.name, out), - np.exp(output.cumulative_logprob))) + out = output.text.split("<|im_end|>")[0] + result.append( + (post_process_output(self.name, out), np.exp(output.cumulative_logprob)) + ) result = choices_dedup(result) return result if __name__ == "__main__": - generation_kwargs = {"model": 
"internlm/internlm2-math-plus-1_8b", - "tensor_parallel_size": 2, - "temperature": 0.6, - "max_tokens": 1024, - "top_p": 0.9, - "length_penalty": 0, - "n": 32, - "do_sample": True, - "output_scores": True, - "output_logits": False, - "return_dict_in_generate": True, - "device": "auto", - } + generation_kwargs = { + "model": "internlm/internlm2-math-plus-1_8b", + "tensor_parallel_size": 2, + "temperature": 0.6, + "max_tokens": 1024, + "top_p": 0.9, + "length_penalty": 0, + "n": 32, + "do_sample": True, + "output_scores": True, + "output_logits": False, + "return_dict_in_generate": True, + "device": "auto", + } model = VLLMTacticGenerator(**generation_kwargs) print(model.generate("n : ℕ\n⊢ gcd n n = n")) diff --git a/python/server.py b/python/server.py index 84c9733..6e581e5 100644 --- a/python/server.py +++ b/python/server.py @@ -11,7 +11,6 @@ # "EleutherAI/llemma_7b": DecoderOnlyTransformer( # "EleutherAI/llemma_7b", num_return_sequences=2, max_length=64, device="auto" # ), - "gpt4": OpenAIRunner( model="gpt-4-turbo-preview", temperature=0.9,