diff --git a/expose.cpp b/expose.cpp
index 6ba0badd97310..fe487c5246f70 100644
--- a/expose.cpp
+++ b/expose.cpp
@@ -35,7 +35,18 @@ extern "C" {
     std::string model = inputs.model_filename;
     lora_filename = inputs.lora_filename;
-    file_format = check_file_format(model.c_str());
+
+    int forceversion = inputs.forceversion;
+
+    if(forceversion==0)
+    {
+        file_format = check_file_format(model.c_str());
+    }
+    else
+    {
+        printf("\nWARNING: FILE FORMAT FORCED TO VER %d\nIf incorrect, loading may fail or crash.\n",forceversion);
+        file_format = (FileFormat)forceversion;
+    }
 
     //first digit is whether configured, second is platform, third is devices
     int parseinfo = inputs.clblast_info;
 
diff --git a/expose.h b/expose.h
index e8550cb7f58bf..dac3247c40d97 100644
--- a/expose.h
+++ b/expose.h
@@ -18,6 +18,7 @@ struct load_model_inputs
     const int clblast_info = 0;
     const int blasbatchsize = 512;
     const bool debugmode;
+    const int forceversion = 0;
 };
 struct generation_inputs
 {
diff --git a/koboldcpp.py b/koboldcpp.py
index 2ac195ef975eb..54fddb313fc8f 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -24,7 +24,8 @@ class load_model_inputs(ctypes.Structure):
                 ("unban_tokens", ctypes.c_bool),
                 ("clblast_info", ctypes.c_int),
                 ("blasbatchsize", ctypes.c_int),
-                ("debugmode", ctypes.c_bool)]
+                ("debugmode", ctypes.c_bool),
+                ("forceversion", ctypes.c_int)]
 
 class generation_inputs(ctypes.Structure):
     _fields_ = [("seed", ctypes.c_int),
@@ -143,6 +144,7 @@ def load_model(model_filename):
     inputs.use_smartcontext = args.smartcontext
     inputs.unban_tokens = args.unbantokens
     inputs.blasbatchsize = args.blasbatchsize
+    inputs.forceversion = args.forceversion
     clblastids = 0
     if args.useclblast:
         clblastids = 100 + int(args.useclblast[0])*10 + int(args.useclblast[1])
@@ -601,9 +603,6 @@ def main(args):
     parser.add_argument("--host", help="Host IP to listen on. If empty, all routable interfaces are accepted.", default="")
     parser.add_argument("--launch", help="Launches a web browser when load is completed.", action='store_true')
     parser.add_argument("--lora", help="LLAMA models only, applies a lora file on top of model. Experimental.", default="")
-
-    #os.environ["OMP_NUM_THREADS"] = '12'
-    # psutil.cpu_count(logical=False)
     physical_core_limit = 1
     if os.cpu_count()!=None and os.cpu_count()>1:
         physical_core_limit = int(os.cpu_count()/2)
@@ -616,6 +615,7 @@ def main(args):
     parser.add_argument("--stream", help="Uses pseudo streaming when generating tokens. Only for the Kobold Lite UI.", action='store_true')
     parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true')
     parser.add_argument("--unbantokens", help="Normally, KoboldAI prevents certain tokens such as EOS and Square Brackets. This flag unbans them.", action='store_true')
+    parser.add_argument("--forceversion", help="If the model file format detection fails (e.g. rogue modified model) you can set this to override the detected format (enter desired version, e.g. 401 for GPTNeoX-Type2).", type=int, default=0)
     parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
     parser.add_argument("--usemlock", help="For Apple Systems. Force system to keep model in RAM rather than swapping or compressing", action='store_true')
     parser.add_argument("--noavx2", help="Do not use AVX2 instructions, a slower compatibility mode for older devices. Does not work with --clblast.", action='store_true')