fixed build cffi/ctypes
mtasic85 committed Jul 11, 2024
1 parent 1a55881 commit 2525245
Showing 7 changed files with 160 additions and 99 deletions.
99 changes: 76 additions & 23 deletions examples/demo_cffi.py
@@ -1,24 +1,77 @@
# import os
# import sys
# sys.path.append(os.path.abspath('.'))

import psutil
from llama.cffi import llama_generate, LlamaOptions


options = LlamaOptions(
    no_display_prompt=True,
    threads=psutil.cpu_count(logical=False),
    # ctx_size=8192,
    ctx_size=4 * 4096,
    predict=512,
    flash_attn=True,
    cont_batching=True,
    simple_io=True,
    # log_disable=True,
    hf_repo='bartowski/Phi-3.1-mini-128k-instruct-GGUF',
    hf_file='Phi-3.1-mini-128k-instruct-Q4_K_M.gguf',
    # hf_file='Phi-3.1-mini-128k-instruct-IQ2_M.gguf',
    chat_template='chatml',
    # prompt='<|im_start|>user\nEvaluate 1 + 2.<|im_end|>\n<|im_start|>assistant\n',
    prompt='<|system|>\nYou are a helpful assistant.<|end|><|user|>\nEvaluate 1 + 2.<|end|>\n<|assistant|>\n',
)

for chunk in llama_generate(options):
    print(chunk, flush=True, end='')
from llama.llama_cli_cffi import llama_generate, Model, Options

from demo_models import models


def demo1():
    options = Options(
        no_display_prompt=True,
        threads=psutil.cpu_count(logical=False),
        ctx_size=8192,
        predict=512,
        flash_attn=True,
        cont_batching=True,
        simple_io=True,
        log_disable=True,
        hf_repo=models[0].hf_repo,
        hf_file=models[0].hf_file,
        prompt='<|system|>\nYou are a helpful assistant.<|end|><|user|>\nEvaluate 1 + 2.<|end|>\n<|assistant|>\n',
    )

    for chunk in llama_generate(options):
        print(chunk, flush=True, end='')

    print()


def demo2():
    options = Options(
        no_display_prompt=True,
        threads=psutil.cpu_count(logical=False),
        ctx_size=2048,
        predict=-2,
        flash_attn=True,
        cont_batching=True,
        simple_io=True,
        log_disable=True,
        hf_repo=models[1].hf_repo,
        hf_file=models[1].hf_file,
        prompt='<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nEvaluate 1 + 2.<|im_end|>\n<|im_start|>assistant\n',
    )

    for chunk in llama_generate(options):
        print(chunk, flush=True, end='')

    print()


def demo3():
    options = Options(
        no_display_prompt=True,
        threads=psutil.cpu_count(logical=False),
        ctx_size=2048,
        predict=-2,
        flash_attn=True,
        cont_batching=True,
        simple_io=True,
        log_disable=True,
        hf_repo=models[2].hf_repo,
        hf_file=models[2].hf_file,
        prompt='<|system|>\nYou are a helpful assistant.<|end|><|user|>\nEvaluate 1 + 2.<|end|>\n<|assistant|>\n',
    )

    for chunk in llama_generate(options):
        print(chunk, flush=True, end='')

    print()


if __name__ == '__main__':
    demo1()
    demo2()
    demo3()
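
The demos above consume llama_generate as an iterator. The signatures later in this commit (llama_generate(options, callback=None) and the trailing callback(None) in llama_cli_ctypes.py) suggest a callback style is also supported, where llama_generate returns None and pushes chunks to the callback, with None as an end-of-stream marker. A hedged sketch of that usage, reusing the settings from demo1; the exact callback contract is an assumption, not something this diff pins down:

import psutil
from llama.llama_cli_cffi import llama_generate, Options
from demo_models import models


def on_chunk(chunk):
    # Assumed contract: chunk is a decoded text fragment, None marks the end
    if chunk is None:
        print()
        return

    print(chunk, flush=True, end='')


options = Options(
    no_display_prompt=True,
    threads=psutil.cpu_count(logical=False),
    ctx_size=8192,
    predict=512,
    simple_io=True,
    log_disable=True,
    hf_repo=models[0].hf_repo,
    hf_file=models[0].hf_file,
    prompt='<|system|>\nYou are a helpful assistant.<|end|><|user|>\nEvaluate 1 + 2.<|end|>\n<|assistant|>\n',
)

# With a callback supplied, llama_generate is expected to drive generation
# itself and return None instead of an iterator.
llama_generate(options, callback=on_chunk)
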
18 changes: 1 addition & 17 deletions examples/demo_ctypes.py
@@ -5,23 +5,7 @@
import psutil
from llama.llama_cli_ctypes import llama_generate, Model, Options

models = [
    Model(
        'microsoft/Phi-3-mini-128k-instruct',
        'bartowski/Phi-3.1-mini-128k-instruct-GGUF',
        'Phi-3.1-mini-128k-instruct-Q4_K_M.gguf',
    ),
    Model(
        'Qwen/Qwen2-1.5B-Instruct',
        'Qwen/Qwen2-1.5B-Instruct-GGUF',
        'qwen2-1_5b-instruct-q4_k_m.gguf',
    ),
    Model(
        'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
        'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
        'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
    ),
]
from demo_models import models


def demo1():
19 changes: 19 additions & 0 deletions examples/demo_models.py
@@ -0,0 +1,19 @@
from llama.llama_cli_model import Model

models = [
    Model(
        'microsoft/Phi-3-mini-128k-instruct',
        'bartowski/Phi-3.1-mini-128k-instruct-GGUF',
        'Phi-3.1-mini-128k-instruct-Q4_K_M.gguf',
    ),
    Model(
        'Qwen/Qwen2-1.5B-Instruct',
        'Qwen/Qwen2-1.5B-Instruct-GGUF',
        'qwen2-1_5b-instruct-q4_k_m.gguf',
    ),
    Model(
        'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
        'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
        'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
    ),
]
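
Model itself comes from llama/llama_cli_model.py, which is not part of this diff. Judging from the three positional arguments above and the models[0].hf_repo / models[0].hf_file attribute access in demo_cffi.py, it is presumably a small record type along these lines; the name of the first field is a guess:

from dataclasses import dataclass


@dataclass
class Model:
    creator_hf_repo: str  # original model card, e.g. 'microsoft/Phi-3-mini-128k-instruct' (field name is a guess)
    hf_repo: str          # GGUF repository, presumably passed to hf_hub_download as repo_id
    hf_file: str          # quantized GGUF filename, presumably passed as filename
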
51 changes: 21 additions & 30 deletions llama/llama_cli_cffi.py
@@ -3,25 +3,41 @@
import json
import ctypes
from queue import Queue
from copy import deepcopy
from typing import Iterator
from threading import Thread
from functools import partial

from huggingface_hub import hf_hub_download

from .llama_cli_model import Model
from .llama_cli_options import Options, convert_options_to_bytes
from ._llama_cli import lib, ffi


FPRINTF_FUNC = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_void_p, ctypes.c_char_p, ctypes.c_char_p)
FFLUSH_FUNC = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_void_p)
_LLAMA_YIELD_TOKEN_T = ctypes.CFUNCTYPE(None, ctypes.c_char_p)
_LLAMA_SHOULD_STOP_T = ctypes.CFUNCTYPE(ctypes.c_int)


def _llama_yield_token_func(chunk: bytes, queue=None, callback=None, metadata=None):
    chunk = chunk.decode()
    print(chunk, flush=True, end='')


def _llama_should_stop_func(queue=None, callback=None, metadata=None) -> int:
    return 0


def _llama_cli_main(argc, argv, queue=None, callback=None, metadata=None):
    r = lib.llama_cli_main(argc, argv)
    _llama_yield_token = _LLAMA_YIELD_TOKEN_T(partial(_llama_yield_token_func, queue=queue, callback=callback, metadata=metadata))
    _llama_should_stop = _LLAMA_SHOULD_STOP_T(partial(_llama_should_stop_func, queue=queue, callback=callback, metadata=metadata))

    _llama_yield_token_address = ctypes.cast(_llama_yield_token, ctypes.c_void_p).value
    _llama_should_stop_address = ctypes.cast(_llama_should_stop, ctypes.c_void_p).value

    cffi__llama_yield_token_callback = ffi.cast('void (*_llama_yield_token_t)(const char * token)', _llama_yield_token_address)
    cffi__llama_should_stop_callback = ffi.cast('int (*_llama_should_stop_t)(void)', _llama_should_stop_address)

    r = lib._llama_cli_main(argc, argv, cffi__llama_yield_token_callback, cffi__llama_should_stop_callback, 1)
    assert r == 0

    if queue is not None:
@@ -48,32 +64,7 @@ def llama_generate(options: Options, callback=None) -> Iterator[str] | None:
    else:
        queue = Queue()

    # get bos, eos, and eot from metedata
    metadata_options = deepcopy(options)
    metadata_options.log_disable = True
    metadata_argv: list[bytes] = [b'llama-cli'] + convert_options_to_bytes(metadata_options)
    metadata_argv = [ffi.new('char[]', n) for n in metadata_argv]
    metadata_argc = len(metadata_argv)

    c_metadata: 'const char*' = lib.llama_get_metadata_as_json(metadata_argc, metadata_argv)
    metadata: bytes = ffi.string(c_metadata)
    lib.llama_free_metadata_as_json(c_metadata)
    metadata: str = metadata.decode('utf-8')
    metadata: dict = json.loads(metadata)
    print(f'{metadata = }')

    # intercept token generation
    fprintf = FPRINTF_FUNC(partial(fprintf_func, queue=queue, metadata=metadata))
    fflush = FFLUSH_FUNC(fflush_func)

    fprintf_address = ctypes.cast(fprintf, ctypes.c_void_p).value
    fflush_address = ctypes.cast(fflush, ctypes.c_void_p).value

    cffi_fprintf_callback = ffi.cast('int (*func)(FILE*, const char* format, ...)', fprintf_address)
    cffi_fflush_callback = ffi.cast('int (*func)(FILE*)', fflush_address)

    lib.llama_set_fprintf(cffi_fprintf_callback)
    lib.llama_set_fflush(cffi_fflush_callback)
    metadata: dict = {}

    argv: list[bytes] = [b'llama-cli'] + convert_options_to_bytes(options)
    argv = [ffi.new('char[]', n) for n in argv]
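
The core of the llama_cli_cffi.py change above is the callback bridge: Python functions are wrapped in ctypes CFUNCTYPE objects (which produce real C-callable trampolines), their raw addresses are extracted with ctypes.cast(..., c_void_p).value, and ffi.cast reinterprets those addresses as the function-pointer types declared for _llama_cli_main. A self-contained sketch of that round trip, with illustrative names and no dependency on this repository; it simply invokes the resulting cffi pointer to show the bridge works:

import ctypes

from cffi import FFI

ffi = FFI()

# ctypes-side callback type, mirroring _LLAMA_YIELD_TOKEN_T above
YIELD_TOKEN_T = ctypes.CFUNCTYPE(None, ctypes.c_char_p)


def yield_token(token: bytes) -> None:
    print(token.decode(), flush=True, end='')


# Keep a reference to the ctypes wrapper alive for as long as C code may call
# it; otherwise the generated trampoline can be garbage collected.
c_yield_token = YIELD_TOKEN_T(yield_token)
address = ctypes.cast(c_yield_token, ctypes.c_void_p).value

# Reinterpret the same machine-level function pointer on the cffi side
cffi_yield_token = ffi.cast('void (*)(const char *)', address)
cffi_yield_token(b'hello\n')  # in the real code, lib._llama_cli_main calls this
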
16 changes: 8 additions & 8 deletions llama/llama_cli_ctypes.py
@@ -2,7 +2,7 @@

import os
import json
from ctypes import *
import ctypes
from queue import Queue
from typing import Iterator
from threading import Thread
@@ -17,13 +17,13 @@
module_path = os.path.abspath(__file__)
module_dir = os.path.dirname(module_path)
llama_cli_lib_path = os.path.join(module_dir, 'llama-cli.so')
lib = CDLL(llama_cli_lib_path)
lib = ctypes.CDLL(llama_cli_lib_path)

_LLAMA_YIELD_TOKEN_T = CFUNCTYPE(None, c_char_p)
_LLAMA_SHOULD_STOP_T = CFUNCTYPE(c_int)
_LLAMA_YIELD_TOKEN_T = ctypes.CFUNCTYPE(None, ctypes.c_char_p)
_LLAMA_SHOULD_STOP_T = ctypes.CFUNCTYPE(ctypes.c_int)

lib._llama_cli_main.argtypes = [c_int, POINTER(c_char_p), _LLAMA_YIELD_TOKEN_T, _LLAMA_SHOULD_STOP_T, c_int]
lib._llama_cli_main.restype = c_int
lib._llama_cli_main.argtypes = [ctypes.c_int, ctypes.POINTER(ctypes.c_char_p), _LLAMA_YIELD_TOKEN_T, _LLAMA_SHOULD_STOP_T, ctypes.c_int]
lib._llama_cli_main.restype = ctypes.c_int


def _llama_yield_token_func(chunk: bytes, queue=None, callback=None, metadata=None):
@@ -47,7 +47,7 @@ def _llama_cli_main(argc, argv, queue=None, callback=None, metadata=None):
        callback(None)


def llama_generate(options: Options, callback=None, metadata=None) -> Iterator[str] | None:
def llama_generate(options: Options, callback=None) -> Iterator[str] | None:
    # check hf_repo, hf_file
    if options.hf_repo and options.hf_file:
        options.model = hf_hub_download(repo_id=options.hf_repo, filename=options.hf_file)
@@ -69,7 +69,7 @@ def llama_generate(options: Options, callback=None) -> Iterator[str] | None:

    argv: list[bytes] = [b'llama-cli'] + convert_options_to_bytes(options)
    argc = len(argv)
    argv = (c_char_p * argc)(*argv)
    argv = (ctypes.c_char_p * argc)(*argv)

    if callback:
        _llama_cli_main(argc, argv, queue, callback, metadata)
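
Pulling the ctypes fragments above together: the patched llama-cli library exposes a single _llama_cli_main entry point that takes the usual argc/argv plus the two callbacks and a trailing flag. A condensed, illustrative sketch of driving it directly; the library path, the option strings, and the meaning of the final flag (named stop_on_bos_eos_eot in build.py) are assumptions based only on the names in this diff:

import ctypes

lib = ctypes.CDLL('llama/llama-cli.so')  # placeholder path; the module resolves it next to itself

_LLAMA_YIELD_TOKEN_T = ctypes.CFUNCTYPE(None, ctypes.c_char_p)
_LLAMA_SHOULD_STOP_T = ctypes.CFUNCTYPE(ctypes.c_int)

lib._llama_cli_main.argtypes = [ctypes.c_int, ctypes.POINTER(ctypes.c_char_p), _LLAMA_YIELD_TOKEN_T, _LLAMA_SHOULD_STOP_T, ctypes.c_int]
lib._llama_cli_main.restype = ctypes.c_int

yield_token = _LLAMA_YIELD_TOKEN_T(lambda t: print(t.decode(), flush=True, end=''))
should_stop = _LLAMA_SHOULD_STOP_T(lambda: 0)  # returning non-zero would presumably abort generation

argv: list[bytes] = [b'llama-cli', b'--model', b'model.gguf']  # placeholder options
argc = len(argv)
c_argv = (ctypes.c_char_p * argc)(*argv)

# Final argument mirrors the stop_on_bos_eos_eot flag declared in build.py
r = lib._llama_cli_main(argc, c_argv, yield_token, should_stop, 1)
assert r == 0
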
52 changes: 33 additions & 19 deletions scripts/build.py
@@ -10,39 +10,52 @@
ffibuilder = FFI()

ffibuilder.cdef('''
    void llama_set_stdout(FILE* f);
    void llama_set_stderr(FILE* f);
    void llama_set_fprintf(int (*func)(FILE*, const char* format, ...));
    void llama_set_fflush(int (*func)(FILE*));
    const char* llama_get_metadata_as_json(int argc, char ** argv);
    void llama_free_metadata_as_json(const char * c_output);
    int llama_cli_main(int argc, char ** argv);
    typedef void (*_llama_yield_token_t)(const char * token);
    typedef int (*_llama_should_stop_t)(void);
    int _llama_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop, int stop_on_bos_eos_eot);
''')

ffibuilder.set_source(
    '_llama_cli',
    '''
    #include <stdio.h>
    void llama_set_stdout(FILE* f);
    void llama_set_stderr(FILE* f);
    void llama_set_fprintf(int (*func)(FILE*, const char* format, ...));
    void llama_set_fflush(int (*func)(FILE*));
    const char* llama_get_metadata_as_json(int argc, char ** argv);
    void llama_free_metadata_as_json(const char * c_output);
    int llama_cli_main(int argc, char ** argv);
    typedef void (*_llama_yield_token_t)(const char * token);
    typedef int (*_llama_should_stop_t)(void);
    int _llama_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop, int stop_on_bos_eos_eot);
    ''',
    libraries=['stdc++'],
    extra_objects=['../llama.cpp/libllama-cli.a'],
    extra_objects=['../llama.cpp/llama-cli.a'],
)


def build(*args, **kwargs):
    # subprocess.run(['rm', '-rf', 'llama.cpp'], check=True)
    # subprocess.run(['git', 'clone', 'https://github.com/ggerganov/llama.cpp.git'], check=True)
    # subprocess.run(['patch', 'llama.cpp/examples/main/main.cpp', 'main_shared_library_1.patch'], check=True)
    # subprocess.run(['patch', 'llama.cpp/Makefile', 'makefile_static_library_0.patch'], check=True)
    env = os.environ.copy()

    subprocess.run(['git', 'clone', 'https://github.com/ggerganov/llama.cpp.git'], check=True)
    subprocess.run(['patch', 'llama.cpp/examples/main/main.cpp', 'main_3.patch'], check=True)
    subprocess.run(['patch', 'llama.cpp/Makefile', 'Makefile_3.patch'], check=True)

    if 'PYODIDE' in env and env['PYODIDE'] == '1':
        env['CXXFLAGS'] += ' -msimd128 -fno-rtti -DNDEBUG -flto=full -s INITIAL_MEMORY=2GB -s MAXIMUM_MEMORY=4GB -s ALLOW_MEMORY_GROWTH '
        env['UNAME_M'] = 'wasm'

    subprocess.run(['make', '-C', 'llama.cpp', '-j', 'llama-cli-shared', 'llama-cli-static', 'GGML_NO_OPENMP=1', 'GGML_NO_LLAMAFILE=1'], check=True, env=env)

    # cffi
    ffibuilder.compile(tmpdir='build', verbose=True)

    # ctypes
    for file in glob.glob('build/*.so') + glob.glob('llama.cpp/*.so'):
        shutil.move(file, 'llama/')

    for file in glob.glob('build/*.dll') + glob.glob('llama.cpp/*.dll'):
        shutil.move(file, 'llama/')

    for file in glob.glob('build/*.dylib') + glob.glob('llama.cpp/*.dylib'):
        shutil.move(file, 'llama/')

    '''
    # cffi
    env = os.environ.copy()
    env['CXXFLAGS'] = '-DSHARED_LIB'
@@ -73,6 +86,7 @@ def build(*args, **kwargs):
    for file in glob.glob('build/*.dylib') + glob.glob('llama.cpp/*.dylib'):
        shutil.move(file, 'llama/')
    '''


if __name__ == '__main__':
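
For context, build.py follows cffi's out-of-line API pattern: cdef declares what Python will see on the generated lib object, set_source provides the C declarations and links the static llama-cli archive built from the patched llama.cpp, and ffibuilder.compile emits the _llama_cli extension that llama_cli_cffi.py imports via from ._llama_cli import lib, ffi. A stripped-down version of the same pattern, with placeholder names and no llama.cpp dependency:

# build_demo.py -- minimal out-of-line cffi build, assuming a static archive
# libdemo.a that defines demo_run(); all names here are illustrative only.
from cffi import FFI

ffibuilder = FFI()

# Declarations exposed to Python through the generated `lib` object
ffibuilder.cdef('''
    typedef void (*demo_cb_t)(const char * token);
    int demo_run(demo_cb_t cb);
''')

# C source compiled into the extension; the real symbols come from extra_objects
ffibuilder.set_source(
    '_demo',
    '''
    typedef void (*demo_cb_t)(const char * token);
    int demo_run(demo_cb_t cb);
    ''',
    extra_objects=['libdemo.a'],
)

if __name__ == '__main__':
    ffibuilder.compile(tmpdir='build', verbose=True)

The generated module would then be consumed the same way llama_cli_cffi.py consumes _llama_cli: from _demo import lib, ffi.
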
4 changes: 2 additions & 2 deletions scripts/clean.py
@@ -3,9 +3,9 @@


def clean():
    files = glob.glob('llama/*.so')
    files = glob.glob('llama/*.so') + glob.glob('llama/*.a') + glob.glob('llama/*.dylib') + glob.glob('llama/*.dll')
    subprocess.run(['rm', '-fv'] + files, check=True)
    subprocess.run(['rm', '-fr', 'build'], check=True)
    subprocess.run(['rm', '-fr', 'dist'], check=True)
    # subprocess.run(['rm', '-fr', 'llama.cpp'], check=True)
    subprocess.run(['rm', '-fr', 'llama.cpp'], check=True)
    subprocess.run(['rm', '-fr', 'wheelhouse'], check=True)
