Skip to content

Commit

Permalink
run.c - Output buffering
Browse files Browse the repository at this point in the history
Previously, much of the run time was spent writing to the screen, which is relatively slow.

With output buffering enabled, groups of computed tokens are written to a buffer, which is relatively fast, and the buffer is then flushed periodically to the screen/console. This leaves more time for computing tokens.

Testing with the smallest model showed an interactive tokens/s speedup ranging from ~14% on standard builds to ~84% on OpenMP builds.

Usage:

run <checkpoint_file> [temperature] [steps] [prompt] [buffer_tokens]

Where buffer_tokens is the number of tokens to be buffered.

Multiples of 2 seem to be ideal. 64 worked well for my use case on a low end machine.

The speedup may depend on model size and OS.

Example:

./run model.bin 0 0 "A car" 64
  • Loading branch information
trholding committed Jul 31, 2023
1 parent 1c4f129 commit f1ff0c8
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 4 deletions.
Binary file added run
Binary file not shown.
16 changes: 12 additions & 4 deletions run.c
Original file line number Diff line number Diff line change
Expand Up @@ -485,14 +485,15 @@ int main(int argc, char *argv[]) {
float temperature = 0.9f; // e.g. 1.0, or 0.0
int steps = 256; // max number of steps to run for, 0: use seq_len
char *prompt = NULL; // prompt string

// 'checkpoint' is necessary arg
int buffertokens = 1; // output token buffer size

#ifdef COSMO_ZIP // if this is defined
// we read the embedded checkpoint from within the executable
// 'checkpoint' is necessary arg
checkpoint = "/zip/out/model.bin" ;
#else
if (argc < 2) {
printf("Usage: %s <checkpoint_file> [temperature] [steps] [prompt]\n", argv[0]);
printf("Usage: %s <checkpoint_file> [temperature] [steps] [prompt] [buffer_tokens]\n", argv[0]);
return 1;
}
if (argc >= 2) {
Expand All @@ -508,6 +509,9 @@ int main(int argc, char *argv[]) {
if (argc >= 5) {
prompt = argv[4];
}
if (argc >= 6) {
buffertokens = atoi(argv[5]);
}
#endif

// seed rng with time. if you want deterministic behavior use temperature 0.0
Expand Down Expand Up @@ -583,7 +587,11 @@ int main(int argc, char *argv[]) {
int next; // will store the next token in the sequence
int token = 1; // init with token 1 (=BOS), as done in Llama-2 sentencepiece tokenizer
int pos = 0; // position in the sequence
int bufferflush = 1; // buffer flush after token counter
char outbuff[2048]; // used for output buffering
memset( outbuff, '\0', sizeof( outbuff )); // clear buffer area
printf("<s>\n"); // explicit print the initial BOS token for stylistic symmetry reasons
setvbuf(stdout, outbuff, _IOFBF, 2048); // setup output buffering
while (pos < steps) {

// forward the transformer to get logits for the next token
Expand All @@ -610,7 +618,7 @@ int main(int argc, char *argv[]) {
// following BOS token (1), sentencepiece decoder strips any leading whitespace (see PR #89)
char *token_str = (token == 1 && vocab[next][0] == ' ') ? vocab[next]+1 : vocab[next];
printf("%s", token_str);
fflush(stdout);
if (bufferflush==pos) { fflush(stdout); bufferflush+=buffertokens; } // flush after every n tokens

// advance forward
token = next;
Expand Down

0 comments on commit f1ff0c8

Please sign in to comment.