fixed llama_generate, _llama_yield_token_func, _llama_should_stop_func, _llama_cli_main
mtasic85 committed Jul 13, 2024
1 parent 72506f2 commit e3b9392
Showing 12 changed files with 510 additions and 287 deletions.
62 changes: 13 additions & 49 deletions examples/demo_cffi.py
@@ -8,61 +8,19 @@
 from demo_models import models
 
 
-def demo1():
-    options = Options(
-        no_display_prompt=True,
-        threads=psutil.cpu_count(logical=False),
-        ctx_size=8192,
-        predict=512,
-        flash_attn=True,
-        cont_batching=True,
-        simple_io=True,
-        log_disable=True,
-        hf_repo=models[0].hf_repo,
-        hf_file=models[0].hf_file,
-        prompt='<|system|>\nYou are a helpful assistant.<|end|><|user|>\nEvaluate 1 + 2.<|end|>\n<|assistant|>\n',
-    )
-
-    for chunk in llama_generate(options):
-        print(chunk, flush=True, end='')
-
-    print()
-
-
-def demo2():
-    options = Options(
-        no_display_prompt=True,
-        threads=psutil.cpu_count(logical=False),
-        ctx_size=2048,
-        predict=-2,
-        flash_attn=True,
-        cont_batching=True,
-        simple_io=True,
-        log_disable=True,
-        hf_repo=models[1].hf_repo,
-        hf_file=models[1].hf_file,
-        prompt='<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nEvaluate 1 + 2.<|im_end|>\n<|im_start|>assistant\n',
-    )
-
-    for chunk in llama_generate(options):
-        print(chunk, flush=True, end='')
-
-    print()
-
-
-def demo3():
+def demo_model(model: Model, messages: list[dict]):
     options = Options(
         no_display_prompt=True,
         threads=psutil.cpu_count(logical=False),
         ctx_size=2048,
         predict=-2,
+        # batch_size=512, # 2048,
         flash_attn=True,
         cont_batching=True,
         simple_io=True,
         log_disable=True,
-        hf_repo=models[2].hf_repo,
-        hf_file=models[2].hf_file,
-        prompt='<|system|>\nYou are a helpful assistant.<|end|><|user|>\nEvaluate 1 + 2.<|end|>\n<|assistant|>\n',
+        model=model,
+        prompt=messages,
     )
 
     for chunk in llama_generate(options):
@@ -72,6 +30,12 @@ def demo3():
 
 
 if __name__ == '__main__':
-    demo1()
-    demo2()
-    demo3()
+    messages = [
+        {'role': 'system', 'content': 'You are a helpful assistant.'},
+        {'role': 'user', 'content': 'Evaluate 1 + 2 in Python.'},
+    ]
+
+    for model in models:
+        print(f'{model = }')
+        demo_model(model, messages)
+        print('-' * 80)
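For context, both updated examples now follow a single pattern: pick a Model entry, build Options from it plus a chat-style messages list, and stream tokens from llama_generate. Below is a minimal sketch of that pattern assembled only from the lines visible in this diff; the cffi import path is assumed by analogy with the ctypes example, and Options may accept other keywords not shown here.

    import psutil
    from llama.llama_cli_cffi import llama_generate, Model, Options  # import path assumed by analogy with the ctypes example

    from demo_models import models

    messages = [
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user', 'content': 'Evaluate 1 + 2.'},
    ]

    options = Options(
        no_display_prompt=True,
        threads=psutil.cpu_count(logical=False),  # physical cores only
        ctx_size=2048,
        predict=-2,
        flash_attn=True,
        cont_batching=True,
        simple_io=True,
        log_disable=True,
        model=models[0],   # any entry from demo_models.models
        prompt=messages,   # messages list instead of a raw prompt string
    )

    # llama_generate yields text chunks as they are produced
    for chunk in llama_generate(options):
        print(chunk, flush=True, end='')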
68 changes: 16 additions & 52 deletions examples/demo_ctypes.py
@@ -1,68 +1,26 @@
-import os
-import sys
-sys.path.append(os.path.abspath('.'))
+# import os
+# import sys
+# sys.path.append(os.path.abspath('.'))
 
 import psutil
 from llama.llama_cli_ctypes import llama_generate, Model, Options
 
 from demo_models import models
 
 
-def demo1():
-    options = Options(
-        no_display_prompt=True,
-        threads=psutil.cpu_count(logical=False),
-        ctx_size=8192,
-        predict=512,
-        flash_attn=True,
-        cont_batching=True,
-        simple_io=True,
-        log_disable=True,
-        hf_repo=models[0].hf_repo,
-        hf_file=models[0].hf_file,
-        prompt='<|system|>\nYou are a helpful assistant.<|end|><|user|>\nEvaluate 1 + 2.<|end|>\n<|assistant|>\n',
-    )
-
-    for chunk in llama_generate(options):
-        print(chunk, flush=True, end='')
-
-    print()
-
-
-def demo2():
-    options = Options(
-        no_display_prompt=True,
-        threads=psutil.cpu_count(logical=False),
-        ctx_size=2048,
-        predict=-2,
-        flash_attn=True,
-        cont_batching=True,
-        simple_io=True,
-        log_disable=True,
-        hf_repo=models[1].hf_repo,
-        hf_file=models[1].hf_file,
-        prompt='<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nEvaluate 1 + 2.<|im_end|>\n<|im_start|>assistant\n',
-    )
-
-    for chunk in llama_generate(options):
-        print(chunk, flush=True, end='')
-
-    print()
-
-
-def demo3():
+def demo_model(model: Model, messages: list[dict]):
     options = Options(
         no_display_prompt=True,
         threads=psutil.cpu_count(logical=False),
         ctx_size=2048,
         predict=-2,
+        # batch_size=512, # 2048,
         flash_attn=True,
         cont_batching=True,
         simple_io=True,
         log_disable=True,
-        hf_repo=models[2].hf_repo,
-        hf_file=models[2].hf_file,
-        prompt='<|system|>\nYou are a helpful assistant.<|end|><|user|>\nEvaluate 1 + 2.<|end|>\n<|assistant|>\n',
+        model=model,
+        prompt=messages,
    )
 
     for chunk in llama_generate(options):
@@ -72,6 +30,12 @@ def demo3():
 
 
 if __name__ == '__main__':
-    demo1()
-    demo2()
-    demo3()
+    messages = [
+        {'role': 'system', 'content': 'You are a helpful assistant.'},
+        {'role': 'user', 'content': 'Evaluate 1 + 2.'},
+    ]
+
+    for model in models:
+        print(f'{model = }')
+        demo_model(model, messages)
+        print('-' * 80)
33 changes: 32 additions & 1 deletion examples/demo_models.py
@@ -1,11 +1,42 @@
from llama.model import Model

models = [
    Model(
        '01-ai/Yi-1.5-9B-Chat-16K',
        'mradermacher/Yi-1.5-9B-Chat-16K-i1-GGUF',
        # 'Yi-1.5-9B-Chat-16K.i1-IQ2_M.gguf',
        # 'Yi-1.5-9B-Chat-16K.i1-IQ3_M.gguf',
        'Yi-1.5-9B-Chat-16K.i1-IQ4_XS.gguf',
        # 'Yi-1.5-9B-Chat-16K.i1-Q4_K_M.gguf',
    ),
    Model(
        'mistralai/Mistral-7B-Instruct-v0.3',
        'bartowski/Mistral-7B-Instruct-v0.3-GGUF',
        # 'Mistral-7B-Instruct-v0.3-IQ2_M.gguf',
        # 'Mistral-7B-Instruct-v0.3-IQ3_M.gguf',
        'Mistral-7B-Instruct-v0.3-IQ4_XS.gguf',
        # 'Mistral-7B-Instruct-v0.3-Q4_K_M.gguf',
    ),
    Model(
        'microsoft/Phi-3-mini-128k-instruct',
        'bartowski/Phi-3.1-mini-128k-instruct-GGUF',
        'Phi-3.1-mini-128k-instruct-Q4_K_M.gguf',
    ),
    Model(
        'microsoft/phi-2',
        'andrijdavid/phi-2-GGUF',
        'ggml-model-Q4_K_M.gguf',
    ),
    Model(
        'IndexTeam/Index-1.9B-Chat',
        'IndexTeam/Index-1.9B-Chat-GGUF',
        'ggml-model-Q4_K_M.gguf',
    ),
    Model(
        'internlm/internlm2-chat-1_8b',
        'QuantFactory/internlm2-chat-1_8b-GGUF',
        'internlm2-chat-1_8b.Q4_K_M.gguf',
    ),
    Model(
        'Qwen/Qwen2-1.5B-Instruct',
        'Qwen/Qwen2-1.5B-Instruct-GGUF',
@@ -16,4 +47,4 @@
        'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
        'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
    ),
]
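Each Model entry above is built from three positional arguments. Their meaning is inferred from the existing entries rather than from the Model class itself (llama/model.py is not part of this diff): the upstream Hugging Face model repo, the repo hosting the GGUF quantizations, and the GGUF file to use. Under that assumption, adding another model would look roughly like the sketch below; the repo and file names are illustrative only and not taken from this commit.

    from llama.model import Model

    from demo_models import models

    # Hypothetical extra entry, following the same positional pattern as above:
    # (upstream HF repo, GGUF quantization repo, GGUF filename) -- names are examples only.
    extra_model = Model(
        'Qwen/Qwen2-0.5B-Instruct',
        'Qwen/Qwen2-0.5B-Instruct-GGUF',
        'qwen2-0_5b-instruct-q4_k_m.gguf',
    )

    models.append(extra_model)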