Merge pull request ggerganov#1 from togethercomputer/support_redpajama
Support redpajama
justusc authored May 8, 2023
2 parents e129551 + 971f270 commit ecd78a6
Showing 20 changed files with 5,939 additions and 1 deletion.
5 changes: 5 additions & 0 deletions .gitignore
@@ -32,6 +32,10 @@ models/*
/benchmark-matmult
/vdot
/Pipfile
/redpajama-chat
/redpajama
/core
/quantize-gptneox

build-info.h
arm_neon.h
@@ -45,3 +49,4 @@ zig-cache/
ppl-*.txt

examples/jeopardy/results.txt
llama.dot
22 changes: 22 additions & 0 deletions Makefile
@@ -221,6 +221,28 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)


gptneox.o: examples/redpajama/gptneox.cpp ggml.h examples/redpajama/gptneox.h examples/redpajama/gptneox-util.h
	$(CXX) $(CXXFLAGS) -c $< -o $@

common-gptneox.o: examples/redpajama/common-gptneox.cpp examples/redpajama/common-gptneox.h
	$(CXX) $(CXXFLAGS) -c $< -o $@

quantize-gptneox: examples/redpajama/quantize-gptneox.cpp ggml.o gptneox.o $(OBJS)
	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

redpajama: examples/redpajama/main-redpajama.cpp ggml.o gptneox.o common-gptneox.o $(OBJS)
	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
	@echo
	@echo '==== Run ./redpajama -h for help. ===='
	@echo

redpajama-chat: examples/redpajama/main-redpajama-chat.cpp ggml.o gptneox.o common-gptneox.o $(OBJS)
	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
	@echo
	@echo '==== Run ./redpajama-chat -h for help. ===='
	@echo

build-info.h: $(wildcard .git/index) scripts/build-info.sh
	@sh scripts/build-info.sh > $@.tmp
	@if ! cmp -s $@.tmp $@; then \
2 changes: 2 additions & 0 deletions README.md
@@ -257,6 +257,8 @@ Building the program with BLAS support may lead to some performance improvements
cmake --build . --config Release
```
Note: Because llama.cpp uses multiple CUDA streams for matrix multiplication results [are not guaranteed to be reproducible](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility). If you need reproducibility, set `GGML_CUDA_MAX_STREAMS` in the file `ggml-cuda.cu` to 1.
### Prepare Data & Run
3 changes: 3 additions & 0 deletions examples/common.cpp
@@ -100,6 +100,9 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
        arg = argv[i];

        if (arg == "-s" || arg == "--seed") {
#if defined(GGML_USE_CUBLAS)
            fprintf(stderr, "WARNING: when using cuBLAS generation results are NOT guaranteed to be reproducible.\n");
#endif
            if (++i >= argc) {
                invalid_param = true;
                break;
143 changes: 143 additions & 0 deletions examples/redpajama/README.md
@@ -0,0 +1,143 @@
# ggml Support for the RedPajama Model

## Acknowledgement

We highly appreciate the great effort behind the [gptneox.cpp](https://github.com/byroneverson/gptneox.cpp) fork; our support for the RedPajama model is largely based on that implementation. We extended the model configuration, fixed a bug that occurred when the `use_parallel_residual` flag was set to `False` in the original implementation, and added a chat mode for RedPajama.
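The `use_parallel_residual` distinction mentioned above can be sketched in a few lines of Python (an illustrative sketch only; `attn`, `mlp`, `ln1`, and `ln2` are hypothetical stand-ins for the model's sublayers, not functions from this repository):

```python
def gptneox_block(x, attn, mlp, ln1, ln2, use_parallel_residual):
    """Simplified GPT-NeoX-style transformer block (illustrative only)."""
    if use_parallel_residual:
        # Parallel residual: attention and MLP both read the block input,
        # and their outputs are added to x in a single step.
        return x + attn(ln1(x)) + mlp(ln2(x))
    # Sequential residual (use_parallel_residual == False): the MLP
    # consumes the attention output instead of the block input.
    h = x + attn(ln1(x))
    return h + mlp(ln2(h))
```

The two paths produce different results for the same weights, which is why a loader that ignores the flag generates wrong outputs for models trained with `use_parallel_residual = False`.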

## Usage:

### RedPajama Chat model:

- Build the binaries:

  ```bash
  make redpajama-chat quantize-gptneox
  ```


- Prepare the RedPajama model (f16 and q4_0) for ggml:

  ```bash
  bash ./examples/redpajama/scripts/install-RedPajama-INCITE-Chat-3B-v1.sh
  ```

- Run the RedPajama chat model (f16):

  ```bash
  ./redpajama-chat -m ./examples/redpajama/models/pythia/ggml-RedPajama-INCITE-Chat-3B-v1-f16.bin \
      -c 2048 \
      -b 128 \
      -n 1 \
      -t 8 \
      --instruct \
      --color \
      --top_k 30 \
      --top_p 0.95 \
      --temp 0.8 \
      --repeat_last_n 3 \
      --repeat_penalty 1.1 \
      --seed 0
  ```

  Note that you may need to install torch and transformers to run the above scripts, e.g.:

  ```bash
  pip install torch==2.0.0
  pip install transformers==4.28.1
  ```


- Run the RedPajama chat model (q4_0):

  ```bash
  ./redpajama-chat -m ./examples/redpajama/models/pythia/ggml-RedPajama-INCITE-Chat-3B-v1-q4_0.bin \
      -c 2048 \
      -b 128 \
      -n 1 \
      -t 8 \
      --instruct \
      --color \
      --top_k 30 \
      --top_p 0.95 \
      --temp 0.8 \
      --repeat_last_n 3 \
      --repeat_penalty 1.1 \
      --seed 0
  ```

- Run other quantized versions of the RedPajama Chat model (make sure you have the f16 model prepared before you run this):

- Build the quantization tool if you have not already:

  ```bash
  make quantize-gptneox
  ```

- Generate the quantized model. The supported types are: q4_0, q4_1, q4_2, q5_0, q5_1, and q8_0. For example, to produce q4_1, run the following conversion:

  ```bash
  python ./examples/redpajama/scripts/quantize-gptneox.py ./examples/redpajama/models/pythia/ggml-RedPajama-INCITE-Chat-3B-v1-f16.bin --quantize-output-type q4_1
  ```
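If you want every supported quantization type rather than a single one, a small driver loop along these lines could be used (a sketch only; it assumes the f16 model path and the `--quantize-output-type` flag shown above):

```python
import subprocess

F16_MODEL = "./examples/redpajama/models/pythia/ggml-RedPajama-INCITE-Chat-3B-v1-f16.bin"
QUANT_TYPES = ["q4_0", "q4_1", "q4_2", "q5_0", "q5_1", "q8_0"]

def quantize_command(f16_path, qtype):
    # Build the argument list for one invocation of the quantize script.
    return ["python", "./examples/redpajama/scripts/quantize-gptneox.py",
            f16_path, "--quantize-output-type", qtype]

if __name__ == "__main__":
    for qtype in QUANT_TYPES:
        # Each run writes a new ggml model file for the given type.
        subprocess.run(quantize_command(F16_MODEL, qtype), check=True)
```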

- Then you can chat with the quantized model:

  ```bash
  ./redpajama-chat -m ./examples/redpajama/models/pythia/ggml-RedPajama-INCITE-Chat-3B-v1-q4_1.bin \
      -c 2048 \
      -b 128 \
      -n 1 \
      -t 8 \
      --instruct \
      --color \
      --top_k 30 \
      --top_p 0.95 \
      --temp 0.8 \
      --repeat_last_n 3 \
      --repeat_penalty 1.1 \
      --seed 0
  ```




### RedPajama Base/Instruct model:

- Build the binaries:

  ```bash
  make redpajama quantize-gptneox
  ```


- Prepare the RedPajama Base/Instruct model (f16 and q4_0) for ggml:

  ```bash
  bash ./examples/redpajama/scripts/install-RedPajama-INCITE-Base-3B-v1.sh
  # or
  bash ./examples/redpajama/scripts/install-RedPajama-INCITE-Instruct-3B-v1.sh
  ```

- Run other quantized versions of the RedPajama Base/Instruct model (make sure you have the f16 model prepared before you run this). The supported types are: q4_0, q4_1, q4_2, q5_0, q5_1, and q8_0. For example, to produce a q8_0 version of RedPajama-Base, run the following conversion:

  ```bash
  python ./examples/redpajama/scripts/quantize-gptneox.py ./examples/redpajama/models/pythia/ggml-RedPajama-INCITE-Base-3B-v1-f16.bin --quantize-output-type q8_0
  ```

- Run the RedPajama Base/Instruct model (e.g., RedPajama-Instruct q8_0):

  ```bash
  ./redpajama -m ./examples/redpajama/models/pythia/ggml-RedPajama-INCITE-Instruct-3B-v1-q8_0.bin \
      -c 2048 \
      -b 128 \
      -n 1 \
      -t 8 \
      --color \
      --top_k 30 \
      --top_p 0.95 \
      --temp 0.8 \
      --repeat_last_n 3 \
      --repeat_penalty 1.1 \
      --seed 0 \
      --n_predict 256 \
      --verbose-prompt \
      -p "How to schedule a tour to Anfield:"
  ```


## Attribution

The following files are covered by an MIT license and were taken from:

https://github.com/byroneverson/gptneox.cpp

Thank you, Byron.

```
common-gptneox.cpp
copy-gptneox.cpp
gptneox.cpp
quantize-gptneox.cpp
common-gptneox.h
gptneox-util.h
gptneox.h
convert_gptneox_to_ggml.py
quantize-gptneox.py
```