fixed build cffi/ctypes
mtasic85 committed Jul 11, 2024
1 parent 1a55881 commit 2525245
Showing 7 changed files with 160 additions and 99 deletions.
99 changes: 76 additions & 23 deletions examples/demo_cffi.py
@@ -1,24 +1,77 @@
# import os
# import sys
# sys.path.append(os.path.abspath('.'))

import psutil
from llama.cffi import llama_generate, LlamaOptions


options = LlamaOptions(
    no_display_prompt=True,
    threads=psutil.cpu_count(logical=False),
    # ctx_size=8192,
    ctx_size=4 * 4096,
    predict=512,
    flash_attn=True,
    cont_batching=True,
    simple_io=True,
    # log_disable=True,
    hf_repo='bartowski/Phi-3.1-mini-128k-instruct-GGUF',
    hf_file='Phi-3.1-mini-128k-instruct-Q4_K_M.gguf',
    # hf_file='Phi-3.1-mini-128k-instruct-IQ2_M.gguf',
    chat_template='chatml',
    # prompt='<|im_start|>user\nEvaluate 1 + 2.<|im_end|>\n<|im_start|>assistant\n',
    prompt='<|system|>\nYou are a helpful assistant.<|end|><|user|>\nEvaluate 1 + 2.<|end|>\n<|assistant|>\n',
)

for chunk in llama_generate(options):
    print(chunk, flush=True, end='')
from llama.llama_cli_cffi import llama_generate, Model, Options

from demo_models import models


def demo1():
    options = Options(
        no_display_prompt=True,
        threads=psutil.cpu_count(logical=False),
        ctx_size=8192,
        predict=512,
        flash_attn=True,
        cont_batching=True,
        simple_io=True,
        log_disable=True,
        hf_repo=models[0].hf_repo,
        hf_file=models[0].hf_file,
        prompt='<|system|>\nYou are a helpful assistant.<|end|><|user|>\nEvaluate 1 + 2.<|end|>\n<|assistant|>\n',
    )

    for chunk in llama_generate(options):
        print(chunk, flush=True, end='')

    print()


def demo2():
    options = Options(
        no_display_prompt=True,
        threads=psutil.cpu_count(logical=False),
        ctx_size=2048,
        predict=-2,
        flash_attn=True,
        cont_batching=True,
        simple_io=True,
        log_disable=True,
        hf_repo=models[1].hf_repo,
        hf_file=models[1].hf_file,
        prompt='<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nEvaluate 1 + 2.<|im_end|>\n<|im_start|>assistant\n',
    )

    for chunk in llama_generate(options):
        print(chunk, flush=True, end='')

    print()


def demo3():
    options = Options(
        no_display_prompt=True,
        threads=psutil.cpu_count(logical=False),
        ctx_size=2048,
        predict=-2,
        flash_attn=True,
        cont_batching=True,
        simple_io=True,
        log_disable=True,
        hf_repo=models[2].hf_repo,
        hf_file=models[2].hf_file,
        prompt='<|system|>\nYou are a helpful assistant.<|end|><|user|>\nEvaluate 1 + 2.<|end|>\n<|assistant|>\n',
    )

    for chunk in llama_generate(options):
        print(chunk, flush=True, end='')

    print()


if __name__ == '__main__':
    demo1()
    demo2()
    demo3()
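
The demos above consume llama_generate as an iterator. The signatures later in this commit (llama_generate(options, callback=None) and the trailing callback(None) in llama_cli_ctypes.py) suggest a callback style is also supported, where llama_generate returns None and pushes chunks to the callback, with None as an end-of-stream marker. A hedged sketch of that usage, reusing the settings from demo1; the exact callback contract is an assumption, not something this diff pins down:

import psutil
from llama.llama_cli_cffi import llama_generate, Options
from demo_models import models


def on_chunk(chunk):
    # Assumed contract: chunk is a decoded text fragment, None marks the end
    if chunk is None:
        print()
        return

    print(chunk, flush=True, end='')


options = Options(
    no_display_prompt=True,
    threads=psutil.cpu_count(logical=False),
    ctx_size=8192,
    predict=512,
    simple_io=True,
    log_disable=True,
    hf_repo=models[0].hf_repo,
    hf_file=models[0].hf_file,
    prompt='<|system|>\nYou are a helpful assistant.<|end|><|user|>\nEvaluate 1 + 2.<|end|>\n<|assistant|>\n',
)

# With a callback supplied, llama_generate is expected to drive generation
# itself and return None instead of an iterator.
llama_generate(options, callback=on_chunk)
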
18 changes: 1 addition & 17 deletions examples/demo_ctypes.py
@@ -5,23 +5,7 @@
import psutil
from llama.llama_cli_ctypes import llama_generate, Model, Options

models = [
    Model(
        'microsoft/Phi-3-mini-128k-instruct',
        'bartowski/Phi-3.1-mini-128k-instruct-GGUF',
        'Phi-3.1-mini-128k-instruct-Q4_K_M.gguf',
    ),
    Model(
        'Qwen/Qwen2-1.5B-Instruct',
        'Qwen/Qwen2-1.5B-Instruct-GGUF',
        'qwen2-1_5b-instruct-q4_k_m.gguf',
    ),
    Model(
        'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
        'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
        'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
    ),
]
from demo_models import models


def demo1():
19 changes: 19 additions & 0 deletions examples/demo_models.py
@@ -0,0 +1,19 @@
from llama.llama_cli_model import Model

models = [
    Model(
        'microsoft/Phi-3-mini-128k-instruct',
        'bartowski/Phi-3.1-mini-128k-instruct-GGUF',
        'Phi-3.1-mini-128k-instruct-Q4_K_M.gguf',
    ),
    Model(
        'Qwen/Qwen2-1.5B-Instruct',
        'Qwen/Qwen2-1.5B-Instruct-GGUF',
        'qwen2-1_5b-instruct-q4_k_m.gguf',
    ),
    Model(
        'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
        'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
        'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
    ),
]
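
Model itself comes from llama/llama_cli_model.py, which is not part of this diff. Judging from the three positional arguments above and the models[0].hf_repo / models[0].hf_file attribute access in demo_cffi.py, it is presumably a small record type along these lines; the name of the first field is a guess:

from dataclasses import dataclass


@dataclass
class Model:
    creator_hf_repo: str  # original model card, e.g. 'microsoft/Phi-3-mini-128k-instruct' (field name is a guess)
    hf_repo: str          # GGUF repository, presumably passed to hf_hub_download as repo_id
    hf_file: str          # quantized GGUF filename, presumably passed as filename
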
51 changes: 21 additions & 30 deletions llama/llama_cli_cffi.py
@@ -3,25 +3,41 @@
import json
import ctypes
from queue import Queue
from copy import deepcopy
from typing import Iterator
from threading import Thread
from functools import partial

from huggingface_hub import hf_hub_download

from .llama_cli_model import Model
from .llama_cli_options import Options, convert_options_to_bytes
from ._llama_cli import lib, ffi


FPRINTF_FUNC = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_void_p, ctypes.c_char_p, ctypes.c_char_p)
FFLUSH_FUNC = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_void_p)
_LLAMA_YIELD_TOKEN_T = ctypes.CFUNCTYPE(None, ctypes.c_char_p)
_LLAMA_SHOULD_STOP_T = ctypes.CFUNCTYPE(ctypes.c_int)


def _llama_yield_token_func(chunk: bytes, queue=None, callback=None, metadata=None):
    chunk = chunk.decode()
    print(chunk, flush=True, end='')


def _llama_should_stop_func(queue=None, callback=None, metadata=None) -> int:
    return 0


def _llama_cli_main(argc, argv, queue=None, callback=None, metadata=None):
    r = lib.llama_cli_main(argc, argv)
    _llama_yield_token = _LLAMA_YIELD_TOKEN_T(partial(_llama_yield_token_func, queue=queue, callback=callback, metadata=metadata))
    _llama_should_stop = _LLAMA_SHOULD_STOP_T(partial(_llama_should_stop_func, queue=queue, callback=callback, metadata=metadata))

    _llama_yield_token_address = ctypes.cast(_llama_yield_token, ctypes.c_void_p).value
    _llama_should_stop_address = ctypes.cast(_llama_should_stop, ctypes.c_void_p).value

    cffi__llama_yield_token_callback = ffi.cast('void (*_llama_yield_token_t)(const char * token)', _llama_yield_token_address)
    cffi__llama_should_stop_callback = ffi.cast('int (*_llama_should_stop_t)(void)', _llama_should_stop_address)

    r = lib._llama_cli_main(argc, argv, cffi__llama_yield_token_callback, cffi__llama_should_stop_callback, 1)
    assert r == 0

    if queue is not None:
@@ -48,32 +64,7 @@ def llama_generate(options: Options, callback=None) -> Iterator[str] | None:
    else:
        queue = Queue()

    # get bos, eos, and eot from metedata
    metadata_options = deepcopy(options)
    metadata_options.log_disable = True
    metadata_argv: list[bytes] = [b'llama-cli'] + convert_options_to_bytes(metadata_options)
    metadata_argv = [ffi.new('char[]', n) for n in metadata_argv]
    metadata_argc = len(metadata_argv)

    c_metadata: 'const char*' = lib.llama_get_metadata_as_json(metadata_argc, metadata_argv)
    metadata: bytes = ffi.string(c_metadata)
    lib.llama_free_metadata_as_json(c_metadata)
    metadata: str = metadata.decode('utf-8')
    metadata: dict = json.loads(metadata)
    print(f'{metadata = }')

    # intercept token generation
    fprintf = FPRINTF_FUNC(partial(fprintf_func, queue=queue, metadata=metadata))
    fflush = FFLUSH_FUNC(fflush_func)

    fprintf_address = ctypes.cast(fprintf, ctypes.c_void_p).value
    fflush_address = ctypes.cast(fflush, ctypes.c_void_p).value

    cffi_fprintf_callback = ffi.cast('int (*func)(FILE*, const char* format, ...)', fprintf_address)
    cffi_fflush_callback = ffi.cast('int (*func)(FILE*)', fflush_address)

    lib.llama_set_fprintf(cffi_fprintf_callback)
    lib.llama_set_fflush(cffi_fflush_callback)
    metadata: dict = {}

    argv: list[bytes] = [b'llama-cli'] + convert_options_to_bytes(options)
    argv = [ffi.new('char[]', n) for n in argv]
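
The core of the llama_cli_cffi.py change above is the callback bridge: Python functions are wrapped in ctypes CFUNCTYPE objects (which produce real C-callable trampolines), their raw addresses are extracted with ctypes.cast(..., c_void_p).value, and ffi.cast reinterprets those addresses as the function-pointer types declared for _llama_cli_main. A self-contained sketch of that round trip, with illustrative names and no dependency on this repository; it simply invokes the resulting cffi pointer to show the bridge works:

import ctypes

from cffi import FFI

ffi = FFI()

# ctypes-side callback type, mirroring _LLAMA_YIELD_TOKEN_T above
YIELD_TOKEN_T = ctypes.CFUNCTYPE(None, ctypes.c_char_p)


def yield_token(token: bytes) -> None:
    print(token.decode(), flush=True, end='')


# Keep a reference to the ctypes wrapper alive for as long as C code may call
# it; otherwise the generated trampoline can be garbage collected.
c_yield_token = YIELD_TOKEN_T(yield_token)
address = ctypes.cast(c_yield_token, ctypes.c_void_p).value

# Reinterpret the same machine-level function pointer on the cffi side
cffi_yield_token = ffi.cast('void (*)(const char *)', address)
cffi_yield_token(b'hello\n')  # in the real code, lib._llama_cli_main calls this
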
16 changes: 8 additions & 8 deletions llama/llama_cli_ctypes.py
@@ -2,7 +2,7 @@

import os
import json
from ctypes import *
import ctypes
from queue import Queue
from typing import Iterator
from threading import Thread
@@ -17,13 +17,13 @@
module_path = os.path.abspath(__file__)
module_dir = os.path.dirname(module_path)
llama_cli_lib_path = os.path.join(module_dir, 'llama-cli.so')
lib = CDLL(llama_cli_lib_path)
lib = ctypes.CDLL(llama_cli_lib_path)

_LLAMA_YIELD_TOKEN_T = CFUNCTYPE(None, c_char_p)
_LLAMA_SHOULD_STOP_T = CFUNCTYPE(c_int)
_LLAMA_YIELD_TOKEN_T = ctypes.CFUNCTYPE(None, ctypes.c_char_p)
_LLAMA_SHOULD_STOP_T = ctypes.CFUNCTYPE(ctypes.c_int)

lib._llama_cli_main.argtypes = [c_int, POINTER(c_char_p), _LLAMA_YIELD_TOKEN_T, _LLAMA_SHOULD_STOP_T, c_int]
lib._llama_cli_main.restype = c_int
lib._llama_cli_main.argtypes = [ctypes.c_int, ctypes.POINTER(ctypes.c_char_p), _LLAMA_YIELD_TOKEN_T, _LLAMA_SHOULD_STOP_T, ctypes.c_int]
lib._llama_cli_main.restype = ctypes.c_int


def _llama_yield_token_func(chunk: bytes, queue=None, callback=None, metadata=None):
@@ -47,7 +47,7 @@ def _llama_cli_main(argc, argv, queue=None, callback=None, metadata=None):
        callback(None)


def llama_generate(options: Options, callback=None, metadata=None) -> Iterator[str] | None:
def llama_generate(options: Options, callback=None) -> Iterator[str] | None:
    # check hf_repo, hf_file
    if options.hf_repo and options.hf_file:
        options.model = hf_hub_download(repo_id=options.hf_repo, filename=options.hf_file)
@@ -69,7 +69,7 @@ def llama_generate(options: Options, callback=None) -> Iterator[str] | None:

    argv: list[bytes] = [b'llama-cli'] + convert_options_to_bytes(options)
    argc = len(argv)
    argv = (c_char_p * argc)(*argv)
    argv = (ctypes.c_char_p * argc)(*argv)

    if callback:
        _llama_cli_main(argc, argv, queue, callback, metadata)
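
Pulling the ctypes fragments above together: the patched llama-cli library exposes a single _llama_cli_main entry point that takes the usual argc/argv plus the two callbacks and a trailing flag. A condensed, illustrative sketch of driving it directly; the library path, the option strings, and the meaning of the final flag (named stop_on_bos_eos_eot in build.py) are assumptions based only on the names in this diff:

import ctypes

lib = ctypes.CDLL('llama/llama-cli.so')  # placeholder path; the module resolves it next to itself

_LLAMA_YIELD_TOKEN_T = ctypes.CFUNCTYPE(None, ctypes.c_char_p)
_LLAMA_SHOULD_STOP_T = ctypes.CFUNCTYPE(ctypes.c_int)

lib._llama_cli_main.argtypes = [ctypes.c_int, ctypes.POINTER(ctypes.c_char_p), _LLAMA_YIELD_TOKEN_T, _LLAMA_SHOULD_STOP_T, ctypes.c_int]
lib._llama_cli_main.restype = ctypes.c_int

yield_token = _LLAMA_YIELD_TOKEN_T(lambda t: print(t.decode(), flush=True, end=''))
should_stop = _LLAMA_SHOULD_STOP_T(lambda: 0)  # returning non-zero would presumably abort generation

argv: list[bytes] = [b'llama-cli', b'--model', b'model.gguf']  # placeholder options
argc = len(argv)
c_argv = (ctypes.c_char_p * argc)(*argv)

# Final argument mirrors the stop_on_bos_eos_eot flag declared in build.py
r = lib._llama_cli_main(argc, c_argv, yield_token, should_stop, 1)
assert r == 0
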
52 changes: 33 additions & 19 deletions scripts/build.py
@@ -10,39 +10,52 @@
ffibuilder = FFI()

ffibuilder.cdef('''
    void llama_set_stdout(FILE* f);
    void llama_set_stderr(FILE* f);
    void llama_set_fprintf(int (*func)(FILE*, const char* format, ...));
    void llama_set_fflush(int (*func)(FILE*));
    const char* llama_get_metadata_as_json(int argc, char ** argv);
    void llama_free_metadata_as_json(const char * c_output);
    int llama_cli_main(int argc, char ** argv);
    typedef void (*_llama_yield_token_t)(const char * token);
    typedef int (*_llama_should_stop_t)(void);
    int _llama_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop, int stop_on_bos_eos_eot);
''')

ffibuilder.set_source(
    '_llama_cli',
    '''
    #include <stdio.h>
    void llama_set_stdout(FILE* f);
    void llama_set_stderr(FILE* f);
    void llama_set_fprintf(int (*func)(FILE*, const char* format, ...));
    void llama_set_fflush(int (*func)(FILE*));
    const char* llama_get_metadata_as_json(int argc, char ** argv);
    void llama_free_metadata_as_json(const char * c_output);
    int llama_cli_main(int argc, char ** argv);
    typedef void (*_llama_yield_token_t)(const char * token);
    typedef int (*_llama_should_stop_t)(void);
    int _llama_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop, int stop_on_bos_eos_eot);
    ''',
    libraries=['stdc++'],
    extra_objects=['../llama.cpp/libllama-cli.a'],
    extra_objects=['../llama.cpp/llama-cli.a'],
)


def build(*args, **kwargs):
    # subprocess.run(['rm', '-rf', 'llama.cpp'], check=True)
    # subprocess.run(['git', 'clone', 'https://github.com/ggerganov/llama.cpp.git'], check=True)
    # subprocess.run(['patch', 'llama.cpp/examples/main/main.cpp', 'main_shared_library_1.patch'], check=True)
    # subprocess.run(['patch', 'llama.cpp/Makefile', 'makefile_static_library_0.patch'], check=True)
    env = os.environ.copy()

    subprocess.run(['git', 'clone', 'https://github.com/ggerganov/llama.cpp.git'], check=True)
    subprocess.run(['patch', 'llama.cpp/examples/main/main.cpp', 'main_3.patch'], check=True)
    subprocess.run(['patch', 'llama.cpp/Makefile', 'Makefile_3.patch'], check=True)

    if 'PYODIDE' in env and env['PYODIDE'] == '1':
        env['CXXFLAGS'] += ' -msimd128 -fno-rtti -DNDEBUG -flto=full -s INITIAL_MEMORY=2GB -s MAXIMUM_MEMORY=4GB -s ALLOW_MEMORY_GROWTH '
        env['UNAME_M'] = 'wasm'

    subprocess.run(['make', '-C', 'llama.cpp', '-j', 'llama-cli-shared', 'llama-cli-static', 'GGML_NO_OPENMP=1', 'GGML_NO_LLAMAFILE=1'], check=True, env=env)

    # cffi
    ffibuilder.compile(tmpdir='build', verbose=True)

    # ctypes
    for file in glob.glob('build/*.so') + glob.glob('llama.cpp/*.so'):
        shutil.move(file, 'llama/')

    for file in glob.glob('build/*.dll') + glob.glob('llama.cpp/*.dll'):
        shutil.move(file, 'llama/')

    for file in glob.glob('build/*.dylib') + glob.glob('llama.cpp/*.dylib'):
        shutil.move(file, 'llama/')

    '''
    # cffi
    env = os.environ.copy()
    env['CXXFLAGS'] = '-DSHARED_LIB'
@@ -73,6 +86,7 @@ def build(*args, **kwargs):
    for file in glob.glob('build/*.dylib') + glob.glob('llama.cpp/*.dylib'):
        shutil.move(file, 'llama/')
    '''


if __name__ == '__main__':
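
For context, build.py follows cffi's out-of-line API pattern: cdef declares what Python will see on the generated lib object, set_source provides the C declarations and links the static llama-cli archive built from the patched llama.cpp, and ffibuilder.compile emits the _llama_cli extension that llama_cli_cffi.py imports via from ._llama_cli import lib, ffi. A stripped-down version of the same pattern, with placeholder names and no llama.cpp dependency:

# build_demo.py -- minimal out-of-line cffi build, assuming a static archive
# libdemo.a that defines demo_run(); all names here are illustrative only.
from cffi import FFI

ffibuilder = FFI()

# Declarations exposed to Python through the generated `lib` object
ffibuilder.cdef('''
    typedef void (*demo_cb_t)(const char * token);
    int demo_run(demo_cb_t cb);
''')

# C source compiled into the extension; the real symbols come from extra_objects
ffibuilder.set_source(
    '_demo',
    '''
    typedef void (*demo_cb_t)(const char * token);
    int demo_run(demo_cb_t cb);
    ''',
    extra_objects=['libdemo.a'],
)

if __name__ == '__main__':
    ffibuilder.compile(tmpdir='build', verbose=True)

The generated module would then be consumed the same way llama_cli_cffi.py consumes _llama_cli: from _demo import lib, ffi.
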
4 changes: 2 additions & 2 deletions scripts/clean.py
@@ -3,9 +3,9 @@


def clean():
    files = glob.glob('llama/*.so')
    files = glob.glob('llama/*.so') + glob.glob('llama/*.a') + glob.glob('llama/*.dylib') + glob.glob('llama/*.dll')
    subprocess.run(['rm', '-fv'] + files, check=True)
    subprocess.run(['rm', '-fr', 'build'], check=True)
    subprocess.run(['rm', '-fr', 'dist'], check=True)
    # subprocess.run(['rm', '-fr', 'llama.cpp'], check=True)
    subprocess.run(['rm', '-fr', 'llama.cpp'], check=True)
    subprocess.run(['rm', '-fr', 'wheelhouse'], check=True)
