Merge pull request #122 from lean-dojo/external
Support external models
Peiyang-Song authored Sep 1, 2024
2 parents 5e35567 + 30a70c0 commit 8c2e4f8
Showing 10 changed files with 522 additions and 4 deletions.
23 changes: 23 additions & 0 deletions LeanCopilotTests/ModelAPIs.lean
@@ -99,4 +99,27 @@ def reproverExternalEncoder : ExternalEncoder := {
-- Go to ./python and run `uvicorn server:app --port 23337`
#eval encode reproverExternalEncoder "n : ℕ\n⊢ gcd n n = n"
/--
General-purpose LLM APIs: OpenAI, Claude, etc.
-/
def gpt4 : ExternalGenerator := {
  name := "gpt4"
  host := "localhost"
  port := 23337
}
#eval generate gpt4 "n : ℕ\n⊢ gcd n n = n"
/--
Math LLMs: InternLM, DeepSeekMath, etc.
-/
def internLM : ExternalGenerator := {
  name := "InternLM"
  host := "localhost"
  port := 23337
}
#eval generate internLM "n : ℕ\n⊢ gcd n n = n"
-/
17 changes: 14 additions & 3 deletions python/README.md
@@ -1,18 +1,29 @@
Python Server for External Models
=================================

This folder contains code for running some of the leading general-purpose and math-specific models as external models for Lean Copilot. It is also fairly easy to adapt the existing code to run other external models of your choice.

## Requirements

The setup is simple: the script below installs everything needed to run all external models currently supported in this folder. If you only want to run a subset of them, you may not need every package in the final `pip install` step.

```bash
conda create --name lean-copilot python=3.10 python numpy
conda activate lean-copilot
-pip install torch --index-url https://download.pytorch.org/whl/cu121 # Depending on whether you have CUDA and the CUDA version; see https://pytorch.org/.
-pip install fastapi uvicorn loguru transformers openai
+pip install torch --index-url https://download.pytorch.org/whl/cu121 # Depending on whether you have CUDA and, if so, your CUDA version; see https://pytorch.org/.
+pip install fastapi uvicorn loguru transformers openai anthropic google.generativeai vllm
```


## Running the Server

```bash
uvicorn server:app --port 23337
```

After the server is up and running, you can go to `LeanCopilotTests/ModelAPIs.lean` to try your external models out!
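
If you plan to use the API-backed runners, export the corresponding API keys before starting the server. `ANTHROPIC_KEY` and `GOOGLE_API_KEY` are the environment variables read by the Claude and Gemini runners in this folder; the OpenAI variable below is the SDK's usual default and is an assumption here, so check `oai_runner.py` for the exact name.

```bash
export ANTHROPIC_KEY="sk-ant-..."   # read by ClaudeRunner
export GOOGLE_API_KEY="..."         # read by GeminiRunner
export OPENAI_API_KEY="sk-..."      # assumed default for the OpenAI runner; verify in oai_runner.py
```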

## Contributions

We welcome contributions. If you think it would be beneficial to add other external models, or if you would like to contribute to Lean Copilot's external model support in other ways, please feel free to open a PR. The main entry points are this `python` folder and the `ModelAPIs.lean` file under `LeanCopilotTests`.

We use [`black`](https://pypi.org/project/black/) to format code in this folder.
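
To illustrate the contribution path described in the Contributions section above, here is a minimal, hypothetical sketch of a new runner built on the `Generator` interface from `python/external_models/external_parser.py`. The class name, prompt handling, and placeholder output are assumptions for illustration only; how a runner is exposed through `server.py` is not shown in this diff, so follow the existing runners for the real wiring.

```python
# Hypothetical sketch of a new external model runner (illustration only, not part of this PR).
from typing import List, Tuple

from .external_parser import Generator, choices_dedup


class MyModelRunner(Generator):
    """Skeleton for bringing your own external model; replace the body of `generate`."""

    def __init__(self, **args):
        self.name = args["model"]  # identifier for the model/backend you are wrapping

    def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]:
        prompt = input + target_prefix
        # Call your model or API here and collect (tactic, score) candidates.
        candidates = [("simp", 0.0)]  # placeholder output
        return choices_dedup(candidates)
```

A new runner would also be exported from `python/external_models/__init__.py`, as the existing runners are, and hooked into the FastAPI server so it can be reached from `ModelAPIs.lean`.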
5 changes: 5 additions & 0 deletions python/external_models/__init__.py
@@ -0,0 +1,5 @@
from .oai_runner import OpenAIRunner
from .hf_runner import HFTacticGenerator
from .vllm_runner import VLLMTacticGenerator
from .claude_runner import ClaudeRunner
from .gemini_runner import GeminiRunner
47 changes: 47 additions & 0 deletions python/external_models/claude_runner.py
@@ -0,0 +1,47 @@
from typing import List, Tuple
import os

try:
    from anthropic import Anthropic
except ImportError as e:
    pass
from .external_parser import *


class ClaudeRunner(Generator, Transformer):
    client = Anthropic(api_key=os.getenv("ANTHROPIC_KEY"))

    def __init__(self, **args):
        self.client_kwargs: dict[str, str | float | int] = {
            "model": args["model"],
            "temperature": args["temperature"],
            "max_tokens": args["max_tokens"],
            "top_p": args["top_p"],
        }
        self.name = self.client_kwargs["model"]

    def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]:
        prompt = pre_process_input(self.name, input + target_prefix)
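        # NOTE (assumption, not part of this PR): the call below uses Anthropic's legacy
        # Completions API; Claude 3 models are typically served via the Messages API, so it
        # may need adjusting depending on the installed `anthropic` SDK version.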

        response = self.client.completions.create(
            prompt=prompt,
            **self.client_kwargs,
        )
        content = response.completion

        results = [
            (post_process_output(self.name, content), 1.0)
        ]  # Currently Claude only supports one output.
        return choices_dedup(results)


if __name__ == "__main__":
    generation_kwargs = {
        "model": "claude-3-opus",
        "temperature": 0.9,
        "max_tokens": 1024,
        "top_p": 0.9,
    }

    model = ClaudeRunner(**generation_kwargs)
    print(model.generate("n : ℕ\n⊢ gcd n n = n"))
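
The `generate` call in `ClaudeRunner` relies on Anthropic's legacy Completions endpoint. As a hedged alternative (an assumption about your SDK version and model access, not part of this PR), a self-contained Messages-API variant of the same call might look like this; the model ID shown is illustrative.

```python
# Sketch only (assumption, not part of this PR): a Messages-API variant of the Claude call.
import os

from anthropic import Anthropic


def claude_messages_generate(
    prompt: str,
    model: str = "claude-3-opus-20240229",  # illustrative model ID; use whichever you have access to
    max_tokens: int = 1024,
    temperature: float = 0.9,
    top_p: float = 0.9,
) -> str:
    client = Anthropic(api_key=os.getenv("ANTHROPIC_KEY"))
    response = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        messages=[{"role": "user", "content": prompt}],
    )
    # Messages responses carry a list of content blocks; take the text of the first one.
    return response.content[0].text
```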
83 changes: 83 additions & 0 deletions python/external_models/external_parser.py
@@ -0,0 +1,83 @@
import torch
import numpy as np
from typing import List, Tuple
from abc import ABC, abstractmethod


def get_cuda_if_available():
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")


def pre_process_input(model_name, input):
    if model_name == "internlm/internlm2-math-plus-1_8b":
        prompt = (
            "My LEAN 4 state is:\n```lean\n"
            + input
            + "```\nPlease predict a possible tactic to help me prove the theorem."
        )
        prompt = f"""<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"""
    elif model_name == "gpt-3.5-turbo" or model_name == "gpt-4-turbo-preview":
        prompt = (
            "Here is a theorem you need to prove in Lean:\n"
            + input
            + "\nNow you should suggest one line tactic in lean code:"
        )
    elif "gemini" in model_name or "claude" in model_name:
        prompt = (
            "Here is a theorem you need to prove in Lean:\n"
            + input
            + "\nNow you should suggest one line tactic in lean code:"
        )
    else:
        raise NotImplementedError(f"External model '{model_name}' not supported")
    return prompt


def post_process_output(model_name, output):
    if model_name == "internlm/internlm2-math-plus-1_8b":
        result = (
            output.split("assistant")[-1]
            .split("lean")[-1]
            .split("```")[0]
            .split("\n")[1]
        )
    elif model_name == "gpt-3.5-turbo" or model_name == "gpt-4-turbo-preview":
        result = output.split("lean")[-1].split("```")[0].split("\n")[1]
    elif "gemini" in model_name or "claude" in model_name:
        result = output.split("lean")[-1].split("```")[0].split("\n")[1]
    else:
        raise NotImplementedError(f"External model '{model_name}' not supported")
    return result


def choices_dedup(output_list: List[tuple[str, float]]) -> List[tuple[str, float]]:
    # Keep the highest score for each distinct suggestion, then sort by score descending.
    unique_data = {}
    for item in output_list:
        if item[0] not in unique_data or item[1] > unique_data[item[0]]:
            unique_data[item[0]] = item[1]
    sorted_data = sorted(unique_data.items(), key=lambda x: x[1], reverse=True)
    return sorted_data


class Generator(ABC):
    @abstractmethod
    def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]:
        pass


class Encoder(ABC):
    @abstractmethod
    def encode(self, input: str) -> np.ndarray:
        pass


class Transformer:
    def cuda(self) -> None:
        self.model.cuda()

    def cpu(self) -> None:
        self.model.cpu()

    @property
    def device(self) -> torch.device:
        return self.model.device
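
As a small illustration (not part of the diff) of how the helpers above behave: `choices_dedup` keeps the best score per distinct suggestion and sorts by score in descending order. The import path below assumes the snippet is run from the `python` folder.

```python
# Illustration of choices_dedup from external_parser.py (assumes running from the python folder).
from external_models.external_parser import choices_dedup

candidates = [
    ("simp", -0.7),
    ("rfl", -1.2),
    ("simp", -0.3),  # duplicate suggestion with a higher score; this one is kept
]
print(choices_dedup(candidates))
# Expected output: [('simp', -0.3), ('rfl', -1.2)]
```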
73 changes: 73 additions & 0 deletions python/external_models/gemini_runner.py
@@ -0,0 +1,73 @@
from typing import List, Tuple
import os

try:
    import google.generativeai as genai
    from google.generativeai import GenerationConfig
except ImportError as e:
    pass
from .external_parser import *


class GeminiRunner(Generator, Transformer):
    client = genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
    safety_settings = [
        {
            "category": "HARM_CATEGORY_HARASSMENT",
            "threshold": "BLOCK_NONE",
        },
        {
            "category": "HARM_CATEGORY_HATE_SPEECH",
            "threshold": "BLOCK_NONE",
        },
        {
            "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "threshold": "BLOCK_NONE",
        },
        {
            "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
            "threshold": "BLOCK_NONE",
        },
    ]

    def __init__(self, **args):
        self.client_kwargs: dict[str, str | float | int] = {
            "model": args["model"],
            "temperature": args["temperature"],
            "max_tokens": args["max_tokens"],
            "top_p": args["top_p"],
        }
        self.name = self.client_kwargs["model"]
        self.client = genai.GenerativeModel(args["model"])
        self.generation_config = GenerationConfig(
            candidate_count=1,
            max_output_tokens=args["max_tokens"],
            temperature=args["temperature"],
            top_p=args["top_p"],
        )

    def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]:
        prompt = pre_process_input(self.name, input + target_prefix)

        response = self.client.generate_content(
            prompt,
            generation_config=self.generation_config,
            safety_settings=GeminiRunner.safety_settings,
        )

        results = [
            (post_process_output(self.name, response.text), 1.0)
        ]  # Currently Gemini only supports one output.
        return choices_dedup(results)


if __name__ == "__main__":
    generation_kwargs = {
        "model": "gemini-1.0-pro",
        "temperature": 0.9,
        "max_tokens": 1024,
        "top_p": 0.9,
    }

    model = GeminiRunner(**generation_kwargs)
    print(model.generate("n : ℕ\n⊢ gcd n n = n"))
88 changes: 88 additions & 0 deletions python/external_models/hf_runner.py
@@ -0,0 +1,88 @@
import torch
from loguru import logger
from typing import List, Tuple
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)
from .external_parser import *


class HFTacticGenerator(Generator, Transformer):
    def __init__(self, **args) -> None:
        self.name = args["model"]
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.name, trust_remote_code=True
        )
        device = args["device"]
        if device == "auto":
            device = get_cuda_if_available()
        else:
            device = torch.device(device)
        logger.info(f"Loading {self.name} on {device}")
        self.model = AutoModelForCausalLM.from_pretrained(
            self.name, trust_remote_code=True
        ).to(device)

        self.generation_args: dict[str, str | float | int | bool] = {
            "do_sample": args["do_sample"],
            "temperature": args["temperature"],  # chat default is 0.8.
            "max_new_tokens": args["max_new_tokens"],
            "top_p": args["top_p"],  # chat default is 0.8.
            "num_return_sequences": args["num_return_sequences"],
            # "num_beams": self.num_return_sequences,  # With beam search, LLM outputs tend to lack diversity (few distinct tactics).
            "output_scores": args["output_scores"],
            "output_logits": args["output_logits"],
            "return_dict_in_generate": args["return_dict_in_generate"],
        }

    def generate(self, input: str, target_prefix: str = "") -> List[Tuple[str, float]]:
        prompt = input + target_prefix
        '''prompt= 'Here is a theorem you need to prove in Lean:\n'+prompt+'\nNow you should suggest one line tactic in lean code:'
        prompt = f"""<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"""
        '''
        prompt = pre_process_input(self.name, prompt)

        self.model = self.model.eval()

        tokenized_input = self.tokenizer(prompt, return_tensors="pt")
        eos_token_id = [
            self.tokenizer.eos_token_id,
            self.tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0],
        ]
        outputs = self.model.generate(
            tokenized_input.input_ids.to(self.device),
            eos_token_id=eos_token_id,
            **self.generation_args,
        )
        response = self.tokenizer.batch_decode(
            outputs["sequences"], skip_special_tokens=True
        )

        # Pair each decoded sequence with a heuristic confidence derived from the generation scores.
        result = []
        index = 0
        for out, score in zip(response, outputs.scores):
            out = post_process_output(self.name, out)
            result.append((out, score[index].exp().sum().log().cpu().item()))
            index += 1
        result = choices_dedup(result)
        return result


if __name__ == "__main__":
    generation_kwargs = {
        "model": "internlm/internlm2-math-plus-1_8b",
        "temperature": 0.6,
        "max_new_tokens": 1024,
        "top_p": 0.9,
        "length_penalty": 0,
        "num_return_sequences": 64,
        "do_sample": True,
        "output_scores": True,
        "output_logits": False,
        "return_dict_in_generate": True,
        "device": "auto",
    }
    model = HFTacticGenerator(**generation_kwargs)
    model.cuda()
    print(model.generate("n : ℕ\n⊢ gcd n n = n"))
(Diffs for the remaining changed files are not expanded in this view.)
