diff --git a/.gitignore b/.gitignore index 2fc06ea7..f573c1cf 100644 --- a/.gitignore +++ b/.gitignore @@ -135,3 +135,10 @@ debug-tmp/ wandb/ results/ .vscode/ + +# defined by rui +DS_1000/ +.DS_Store +NLP4Code_humaneval_outputs +NLP4Code_ds1000_outputs +raw_output_evaluation.py diff --git a/README.md b/README.md index 27fad5e1..4d06372f 100644 --- a/README.md +++ b/README.md @@ -1,61 +1,11 @@ -# NLP4Code -Repository for the NLP4Code project at the LILY lab. +Installing Human Eval -## Installation -*[Recommended]* Create a virtualenv or conda enviroment -```bash -conda create -n nlp4code python=3.8 -conda activate nlp4code ``` -Then, install the dependencies: -```bash -pip install -r requirements.txt -``` -*(Optional)* At any point, if you met with the Python import problem (e.g., `ModuleNotFoundError`), try doing this in the main (`NLP4Code`) directory: -```bash -export PYTHONPATH=`pwd` -``` - -## Wandb -We use Wandb for experiment tracking. Please register ask Ansong for an invitation to the Wandb Yale-LILY team before -running experiments. When you are ready to run the exps and log it to the cloud, do the following: -``` -wandb login -``` -Paste your API key and the login is complete. When start running experiments, you should see something like -``` -wandb: Tracking run with wandb version 0.12.11 -wandb: Run data is saved locally in /home/ansongni/Code/NLP4Code/wandb/run-20220309_150158-1ebacxm4 -wandb: Run `wandb offline` to turn off syncing. -wandb: Syncing run mathqa-gpt-finetuning -wandb: ⭐️ View project at https://wandb.ai/yale-lily/unified-codegen -wandb: 🚀 View run at https://wandb.ai/yale-lily/unified-codegen/runs/1ebacxm4 +git clone https://github.com/openai/human-eval +pip install -e human-eval ``` +Creating JSONL files -If you want to do some test runs without logging to the cloud, run `wandb offline` first as suggested above. - -## Naming of the experiments -In the $*.yaml$ configuration file, you should see a line like -``` -default_root_dir: &exp_name results/mathqa-gpt_neo_1.3B-finetuning -``` -We automatically get the experiment name by the string after `/`, the tags for the experiments are automatically -generated by spliting that string by `-`. In this case, the experiment will be named `mathqa-gpt_neo_1.3B-finetuning` -and the tags will be `["mathqa", "gpt_neo_1.3B", "finetuning"]`. Please follow this convention so that we can write all -of this in one place. 
- -## Fine-tuning -(Read the previous sections first if you are ready to run experiments) -For fine-tuning, in the main directory, do: -``` -python finetuning/trainer.py fit --config finetuning/training_configs/*.yaml ``` - -## Testing -There are some basic tests in the `tests` folder, to run all the tests (follow [this link](https://docs.python.org/3/library/unittest.html#command-line-interface) for more): -To run tests, do -```bash -python -m unittest discover -# or -python -m unittest discover -s -p '*_test.py' +python preprocessing/preprocess_humaneval.py ``` \ No newline at end of file diff --git a/execution/executors.py b/execution/executors.py index 92c5c15d..a02aa6d7 100644 --- a/execution/executors.py +++ b/execution/executors.py @@ -14,6 +14,9 @@ from execution.safe_execution_util import execute from execution.program_tracing import get_function_final_state +from human_eval.execution import check_correctness + + """ From the models' perspective, the model would only want two things: 1) if the execution result is right; @@ -168,7 +171,11 @@ def gold_program_len(self, example: Dict[str, Any]) -> int: @overrides def process_output(self, output: str, tokenizer_eos_token: str) -> str: - return output.lstrip().split(tokenizer_eos_token)[0].split("\n\n")[0].split(";")[0].strip() + if not tokenizer_eos_token: + # for llama-based model + return output.lstrip().split("\n\n")[0].split(";")[0].strip() + else: + return output.lstrip().split(tokenizer_eos_token)[0].split("\n\n")[0].split(";")[0].strip() @overrides def exec_result_eq(self, program_dict_1: Dict[str, Any], program_dict_2: Dict[str, Any]) -> bool: @@ -314,7 +321,11 @@ def gold_program_len(self, example: Dict[str, Any]) -> int: @overrides def process_output(self, output: str, tokenizer_eos_token: str) -> str: - return output.lstrip().split(tokenizer_eos_token)[0].split("\n\n")[0].strip() + if not tokenizer_eos_token: + # for llama-based model + return output.lstrip().split("\n\n")[0].strip() + else: + return output.lstrip().split(tokenizer_eos_token)[0].split("\n\n")[0].strip() @overrides def exec_result_eq(self, program_dict_1: Dict[str, Any], program_dict_2: Dict[str, Any]) -> bool: @@ -355,4 +366,61 @@ def real_exec_program(cls, program: str, example: Dict[str, Any]) -> Tuple[int, executed_answer = "ERROR: program failed to execute" exec_match = -1 - return exec_match, executed_answer \ No newline at end of file + return exec_match, executed_answer + + +class HumanEvalExecutor(BaseExecutor): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + @overrides + def cache_key_func(self, program: str, example: Dict[str, Any]) -> str: + return example["prompt"] + " | " + program + + @overrides + def program_len(self, program: str) -> int: + return python_program_len(program) + + @overrides + def gold_program_len(self, example: Dict[str, Any]) -> int: + return self.program_len(example["canonical_solution"]) + + # TODO: modify this later based on generated programs + @overrides + def process_output(self, output: str, tokenizer_eos_token: str) -> str: + stop_sequence = [ '\nclass', '\ndef', '\n#', '\nif', '\nprint'] + min_index = len(output) # Initialize with a large value + for substring in stop_sequence: + index = output.find(substring) + if index != -1 and index < min_index: + min_index = index + + if min_index < len(output): + processed_output = output[:min_index] + else: + processed_output = output + + # for llama, gpt4_alpaca_lora, alpaca_lora_7b, the model output may be missing a space + if not processed_output.startswith(" "): 
+ processed_output = " " + processed_output + + return processed_output + + @overrides + def exec_result_eq(self, program_dict_1: Dict[str, Any], program_dict_2: Dict[str, Any]) -> bool: + return (program_dict_1['exec_result'] and (program_dict_1['exec_result'] == program_dict_2['exec_result'])) + + @classmethod + def real_exec_program(cls, program: str, example: Dict[str, Any]) -> Tuple[int, Union[str, List, Dict]]: + eval_dict = example + metadata = eval_dict.pop('metadata') + eval_dict.update(metadata) + + result_dict = check_correctness(eval_dict, program, timeout=5) + exec_match = result_dict['passed'] + exec_result = result_dict['result'] + + if exec_match < 1 and exec_result.strip() != "failed:": + exec_match = -1 + + return exec_match, exec_result diff --git a/finetuning/lightning_modules/datasets/base_datamodule.py b/finetuning/lightning_modules/datasets/base_datamodule.py index d2cd9de9..2343cef3 100644 --- a/finetuning/lightning_modules/datasets/base_datamodule.py +++ b/finetuning/lightning_modules/datasets/base_datamodule.py @@ -10,6 +10,7 @@ from finetuning.lightning_modules.datasets.spider_reader import FewShotSpiderDataset, SpiderDataset from finetuning.lightning_modules.datasets.mathqa_reader import FewShotMathQADataset, MathQADataset from finetuning.lightning_modules.datasets.mbpp_reader import FewShotMBPPDataset +from finetuning.lightning_modules.datasets.humaneval_reader import FewShotHumanEvalDataset from finetuning.lightning_modules.models.seq2seq_model_util import is_model_gpt_style from finetuning.lightning_modules.models.seq2seq_model_util import left_pad_sequences, right_pad_sequences diff --git a/finetuning/lightning_modules/datasets/humaneval_reader.py b/finetuning/lightning_modules/datasets/humaneval_reader.py new file mode 100644 index 00000000..4cceaf94 --- /dev/null +++ b/finetuning/lightning_modules/datasets/humaneval_reader.py @@ -0,0 +1,35 @@ +import re +import os +import pandas as pd + +from overrides import overrides + +from typing import Dict, Iterable, List, Any, Optional, Union, Tuple + +from finetuning.lightning_modules.datasets.base_reader import NL2CodeDataset, FewShotNL2CodeDataset +from execution.program_tracing import assertion_to_test + +from human_eval.data import write_jsonl, read_problems + + +class FewShotHumanEvalDataset(FewShotNL2CodeDataset): + + instruction: str = "" + example_io_sep: str = "\n" + + @overrides + def get_test_instance(self, example: Dict[str, Any]) -> List[Dict[str, Any]]: + context = self.get_prompt_for_example(example) + + return [self.get_example_dict(example, context, train_mode=False)] + + # @overrides + def promptify_example(self, example: Dict[str, Any], add_code: bool = True, + add_assertion_n: int = 0, test_input_only: bool = False) -> Tuple[str, str]: + + header = example["prompt"] + + if add_code: + return header, f'{example["canonical_solution"]}\n\n' + else: + return header, '' \ No newline at end of file diff --git a/finetuning/lightning_modules/models/openai_model.py b/finetuning/lightning_modules/models/openai_model.py index 8c8e7cf3..000322ae 100644 --- a/finetuning/lightning_modules/models/openai_model.py +++ b/finetuning/lightning_modules/models/openai_model.py @@ -80,10 +80,10 @@ def prune_none_args(**kwargs): temperature=temperature, top_p=top_p, n=n, best_of=best_of, stop=stop, **kwargs) - if engine.startswith("gpt-3.5-turbo"): + if engine.startswith("gpt-3.5-turbo") or engine.startswith("gpt-4"): non_none_args.pop("prompt") non_none_args.pop("engine") - assert len(prompts) == 1, "gpt-3.5-turbo only 
supports one prompt at a time" + assert len(prompts) == 1, "gpt-3.5-turbo or gpt-4 only supports one prompt at a time" if use_chat_format: non_none_args["messages"] = prompt_to_chatgpt_format(prompts[0]) else: @@ -115,7 +115,7 @@ def prune_none_args(**kwargs): time.sleep(60 * 5) # get the text from the returned results and slice the completions to input_n * completion_n - if engine.startswith("gpt-3.5-turbo"): + if engine.startswith("gpt-3.5-turbo") or engine.startswith("gpt-4"): completion_texts = [x['message']['content'] for x in completion.choices] else: completion_texts = [x.text for x in completion.choices] @@ -141,7 +141,7 @@ def __init__(self, ) -> None: SUPPORTED_OPENAI_MODELS = ["code-davinci-002", "code-cushman-002", "code-cushman-001", "code-davinci-001", - "gpt-3.5-turbo"] + "gpt-3.5-turbo", "text-davinci-003", "text-davinci-002","gpt-4"] assert engine in SUPPORTED_OPENAI_MODELS, f"OpenAIModel only supports {SUPPORTED_OPENAI_MODELS}" self.engine = engine diff --git a/finetuning/lightning_modules/models/seq2seq_model.py b/finetuning/lightning_modules/models/seq2seq_model.py index 04594361..5b7acaf4 100644 --- a/finetuning/lightning_modules/models/seq2seq_model.py +++ b/finetuning/lightning_modules/models/seq2seq_model.py @@ -75,9 +75,9 @@ def __init__(self, # We only instantiate this when we need it. self.transformer_model_name = transformer_model_name if "openai" in self.transformer_model_name: - if self.transformer_model_name.startswith("openai/gpt-3.5-turbo"): + if self.transformer_model_name.startswith("openai/gpt-3.5-turbo") or self.transformer_model_name.startswith("openai/gpt-4"): if self.save_raw_generation_results: - print("get_raw_generation_results is not supported for gpt-3.5-turbo, set to False instead") + print("get_raw_generation_results is not supported for gpt-3.5-turbo and gpt-4, set to False instead") self.save_raw_generation_results = False transformer_model_init_args["save_raw_generation_results"] = self.save_raw_generation_results transformer_model_init_args["use_chat_format"] = self.use_chat_format @@ -155,6 +155,10 @@ def generate_and_post_process(self, num_beam = 1 temp = temperature + # https://github.com/THUDM/ChatGLM-6B/issues/31 + if "santacoder" in self.transformer_model_name or "gpt-neox-20b" in self.transformer_model_name or "replit" in self.transformer_model_name: + use_sample = False + generation_results = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, do_sample=use_sample, max_new_tokens=self.max_gen_len, num_beams=num_beam, temperature=temp, num_return_sequences=num_return_sequences, @@ -387,6 +391,7 @@ def validation_epoch_end(self, outputs: List[Dict[str, Any]]) -> None: # save the predictions save_pred_file_path = os.path.join(self.trainer.log_dir, f'predictions_step_{self.trainer.global_step}_rank_{self.trainer.global_rank}.jsonl') + os.makedirs(os.path.dirname(save_pred_file_path), exist_ok=True) with open(save_pred_file_path, 'w+') as f: for prediction in self.predictions: f.write(json.dumps(prediction)+'\n') diff --git a/finetuning/lightning_modules/models/seq2seq_model_util.py b/finetuning/lightning_modules/models/seq2seq_model_util.py index 6bc49bfb..ad13c97b 100755 --- a/finetuning/lightning_modules/models/seq2seq_model_util.py +++ b/finetuning/lightning_modules/models/seq2seq_model_util.py @@ -13,6 +13,7 @@ from transformers import CodeGenTokenizer, CodeGenForCausalLM, T5Tokenizer from transformers import BartTokenizer, BartModel, BartForConditionalGeneration from transformers import DebertaV2Tokenizer, 
DebertaV2ForSequenceClassification +from transformers import LlamaTokenizer, LlamaForCausalLM from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizerFast @@ -55,7 +56,7 @@ def get_model(model_name: str, model = GPT2LMHeadModel.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id) if len(additional_special_tokens) > 0: model.resize_token_embeddings(len(tokenizer)) - elif model_name == "EleutherAI/gpt-j-6B": + elif model_name == "EleutherAI/gpt-j-6b": tokenizer = GPT2Tokenizer.from_pretrained(model_name) tokenizer.pad_token = tokenizer.eos_token @@ -64,6 +65,14 @@ def get_model(model_name: str, gradient_checkpointing=gradient_ckpt, use_cache=not gradient_ckpt) if len(additional_special_tokens) > 0: model.resize_token_embeddings(len(tokenizer)) + elif model_name in ["EleutherAI/gpt-neox-20b", "EleutherAI/pythia-1.4b-deduped", "EleutherAI/pythia-6.9b-deduped", "EleutherAI/pythia-12b-deduped", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"]: + tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenizer.pad_token = tokenizer.eos_token + + if not tokenizer_only: + model = AutoModelForCausalLM.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id) + if len(additional_special_tokens) > 0: + model.resize_token_embeddings(len(tokenizer)) elif model_name in ["EleutherAI/gpt-neo-1.3B", "EleutherAI/gpt-neo-125M", "EleutherAI/gpt-neo-2.7B"]: tokenizer = GPT2Tokenizer.from_pretrained(model_name, additional_special_tokens=additional_special_tokens) tokenizer.pad_token = tokenizer.eos_token @@ -136,7 +145,55 @@ def get_model(model_name: str, if not tokenizer_only: model = BartForSequenceClassification.from_pretrained(model_name, num_labels=2) - + elif "llama" in model_name.lower() or "alpaca" in model_name.lower(): + tokenizer = LlamaTokenizer.from_pretrained(model_name, + additional_special_tokens=additional_special_tokens) + tokenizer.pad_token = tokenizer.eos_token + + if not tokenizer_only: + model = LlamaForCausalLM.from_pretrained(model_name, + pad_token_id=tokenizer.eos_token_id, + torch_dtype=torch.float16) + if len(additional_special_tokens) > 0: + model.resize_token_embeddings(len(tokenizer)) + elif model_name == "bigcode/santacoder": + tokenizer = AutoTokenizer.from_pretrained(model_name, + additional_special_tokens=additional_special_tokens) + tokenizer.pad_token = tokenizer.eos_token + + if not tokenizer_only: + model = AutoModelForCausalLM.from_pretrained(model_name, + pad_token_id=tokenizer.eos_token_id, + torch_dtype=torch.float32, + trust_remote_code=True, + ) + if len(additional_special_tokens) > 0: + model.resize_token_embeddings(len(tokenizer)) + elif model_name in ["bigcode/starcoder", "HuggingFaceH4/starchat-alpha"]: + tokenizer = AutoTokenizer.from_pretrained(model_name, + additional_special_tokens=additional_special_tokens) + tokenizer.pad_token = tokenizer.eos_token + + if not tokenizer_only: + model = AutoModelForCausalLM.from_pretrained(model_name, + pad_token_id=tokenizer.eos_token_id, + torch_dtype=torch.float16, + trust_remote_code=True) + if len(additional_special_tokens) > 0: + model.resize_token_embeddings(len(tokenizer)) + elif model_name == "replit/replit-code-v1-3b": + tokenizer = AutoTokenizer.from_pretrained(model_name, + additional_special_tokens=additional_special_tokens, + trust_remote_code=True) + tokenizer.pad_token = tokenizer.eos_token + + if not tokenizer_only: + model = AutoModelForCausalLM.from_pretrained(model_name, + pad_token_id=tokenizer.eos_token_id, + torch_dtype=torch.float16, + 
trust_remote_code=True) + if len(additional_special_tokens) > 0: + model.resize_token_embeddings(len(tokenizer)) elif model_name.startswith("openai/"): engine = model_name.split("/")[-1] @@ -144,7 +201,7 @@ def get_model(model_name: str, tokenizer.pad_token = tokenizer.eos_token # to accomandate the length of openai models and the prompt - if engine in ["code-davinci-002"]: + if engine in ["code-davinci-002", "gpt-4"]: model_length = 8001 elif engine in ["code-cushman-001", "code-cushman-002"]: model_length = 1024 diff --git a/finetuning/training_configs/few_shot/humaneval.yaml b/finetuning/training_configs/few_shot/humaneval.yaml new file mode 100644 index 00000000..7f539ceb --- /dev/null +++ b/finetuning/training_configs/few_shot/humaneval.yaml @@ -0,0 +1,62 @@ +seed_everything: 333 +trainer: + default_root_dir: &exp_name results/debug-tmp + # progress_bar_refresh_rate: 1 + num_sanity_val_steps: 0 + log_every_n_steps: 1 + logger+: + - class_path: finetuning.lightning_modules.patches.patched_loggers.PatchedWandbLogger + init_args: + entity: yale-lily + project: unified-codegen + save_dir: *exp_name + name: *exp_name + log_model: False + save_code: True + offline: False + # offline: True + callbacks+: + - class_path: pytorch_lightning.callbacks.progress.TQDMProgressBar + init_args: + refresh_rate: 1 + + accelerator: gpu + devices: 1 + # strategy: deepspeed_stage_2 + strategy: ddp_find_unused_parameters_false + precision: 16 + +model: + class_path: finetuning.lightning_modules.models.seq2seq_model.Seq2SeqModel + init_args: + transformer_model_name: default-will-cause-error + executor_cls: execution.executors.HumanEvalExecutor + max_gen_len: 256 + sampling_temp: 0.001 + # sampling_temp_at_k: 0.8 + # pass_at_k: 50 + # max_generation_batches: 5 + gradient_ckpt: false + save_raw_generation_results: true + # print_eval_every_n_batches: 1 + +data: + class_path: finetuning.lightning_modules.datasets.base_datamodule.FewShotNL2CodeDataModule + init_args: + transformer_model_name: default-will-cause-error + dataset_cls: FewShotHumanEvalDataset + batch_size: 1 + val_batch_size: 4 + ## prompting settings + prompting_init_args: + exemplar_file_path: prompt_files/humaneval-8_exemplars.jsonl + num_exemplars: 8 + fixed_exemplars: true + exemplar_selection_method: first + add_instruction: true + use_chat_format: false + additional_prompt_func_args: + add_assertion_n: 1 + test_input_only: false + val_set_init_args: + file_path: data/humaneval/humaneval.jsonl \ No newline at end of file diff --git a/finetuning/training_configs/few_shot/mathqa-8_fixed_mathqa_shots_llama.yaml b/finetuning/training_configs/few_shot/mathqa-8_fixed_mathqa_shots_llama.yaml new file mode 100644 index 00000000..c93de965 --- /dev/null +++ b/finetuning/training_configs/few_shot/mathqa-8_fixed_mathqa_shots_llama.yaml @@ -0,0 +1,100 @@ +seed_everything: 333 +trainer: + gpus: 2 + gradient_clip_val: 1.0 + default_root_dir: &exp_name results/debug-tmp-128 + # default_root_dir: &exp_name results/gsmath-incoder_6b-few_shot-pass_at_50-train-output_prob-gen_len_256-gsm_shots-split_0 + val_check_interval: 1.0 + max_steps: &max_steps 25000 + # progress_bar_refresh_rate: 1 + num_sanity_val_steps: 0 + log_every_n_steps: 1 + logger+: + - class_path: finetuning.lightning_modules.patches.patched_loggers.PatchedWandbLogger + init_args: + entity: yilunzhao + project: unified-codegen + save_dir: *exp_name + name: *exp_name + log_model: False + save_code: True + offline: False + # offline: True + callbacks+: + - class_path: 
pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint + init_args: + # monitor: exec_acc + monitor: pass@100 + mode: max + # filename: '{step}-{exec_acc:.4f}-{exec_rate:.4f}' + filename: '{step}-{pass@100:.4f}-{exec_acc:.4f}' + save_top_k: 3 + save_last: True + - class_path: pytorch_lightning.callbacks.LearningRateMonitor + init_args: + logging_interval: step + - class_path: pytorch_lightning.callbacks.progress.TQDMProgressBar + init_args: + refresh_rate: 1 + # - class_path: pytorch_lightning.callbacks.gpu_stats_monitor.GPUStatsMonitor + # init_args: + # memory_utilization: true + # gpu_utilization: true + + accelerator: gpu + # replace_sampler_ddp: False + # https://github.com/PyTorchLightning/pytorch-lightning/issues/8262 + # strategy: deepspeed_stage_2 + # strategy: deepspeed_stage_2 + strategy: ddp_find_unused_parameters_false + precision: 16 + # accumulate_grad_batches: 2 + +model: + class_path: lightning_modules.models.seq2seq_model.Seq2SeqModel + init_args: + transformer_model_name: &transformer decapoda-research/llama-7b-hf + executor_cls: execution.executors.MathExecutor + max_gen_len: 256 + sampling_temp: 0.001 + # sampling_temp_at_k: 0.8 + # pass_at_k: 50 + # max_generation_batches: 5 + gradient_ckpt: false + save_raw_generation_results: true + # print_eval_every_n_batches: 1 + optimizer: + init_args: + # lr: 5.0e-5 + lr: 0.0 + betas: + - 0.9 + - 0.999 + eps: 1.0e-8 + weight_decay: 0.1 + lr_scheduler: + name: linear + init_args: + num_warmup_steps: 100 + num_training_steps: *max_steps + +data: + class_path: lightning_modules.datasets.base_datamodule.FewShotNL2CodeDataModule + init_args: + transformer_model_name: *transformer + dataset_cls: FewShotMathQADataset + batch_size: 1 + val_batch_size: 4 + ## prompting settings + prompting_init_args: + exemplar_file_path: prompt_files/mathqa-non_idiomatic_code-annotated-8_exemplars.jsonl + num_exemplars: 8 + fixed_exemplars: true + exemplar_selection_method: first + add_instruction: true + use_chat_format: false + # val_max_instances: 64 + val_set_init_args: + file_path: data/mathqa/val_dedup_init_val.jsonl + +# clear; export PYTHONPATH=`pwd`; python finetuning/new_trainer.py validate --config finetuning/training_configs/few_shot/mathqa-8_fixed_mathqa_shots_llama.yaml \ No newline at end of file diff --git a/preprocessing/preprocess_humaneval.py b/preprocessing/preprocess_humaneval.py new file mode 100644 index 00000000..6e39e1d1 --- /dev/null +++ b/preprocessing/preprocess_humaneval.py @@ -0,0 +1,34 @@ +import json +import os +import random +from human_eval.data import write_jsonl, read_problems + +file_path = os.getcwd() + "/data/humaneval/humaneval.jsonl" +os.makedirs(os.path.dirname(file_path), exist_ok=True) + +problems = read_problems() +result_jsonl = [] +metadata_keys = ['task_id', 'entry_point', 'test'] +for task_id in problems: + new_dict = {} + metadata = {} + for key in metadata_keys: + metadata[key] = problems[task_id][key] + new_dict['metadata'] = metadata + new_dict['prompt'] = problems[task_id]['prompt'] + new_dict['canonical_solution'] = problems[task_id]['canonical_solution'] + result_jsonl.append(new_dict) + +write_jsonl(file_path, result_jsonl) + +selected_problems = [] +with open(file_path, 'r') as file: + for line in file: + json_obj = json.loads(line) + selected_problems.append(json_obj) +selected_problems = random.sample(selected_problems, 8) + +file_path = os.getcwd() + "/prompt_files/humaneval-8_exemplars.jsonl" +os.makedirs(os.path.dirname(file_path), exist_ok=True) + +write_jsonl(file_path, 
selected_problems) diff --git a/prompt_files/humaneval-8_exemplars.jsonl b/prompt_files/humaneval-8_exemplars.jsonl new file mode 100644 index 00000000..726cface --- /dev/null +++ b/prompt_files/humaneval-8_exemplars.jsonl @@ -0,0 +1,8 @@ +{"metadata": {"entry_point": "fib4", "test": "\n\nMETADATA = {}\n\n\ndef check(candidate):\n assert candidate(5) == 4\n assert candidate(8) == 28\n assert candidate(10) == 104\n assert candidate(12) == 386\n\n"}, "task_id": "HumanEval/46", "prompt": "\n\ndef fib4(n: int):\n \"\"\"The Fib4 number sequence is a sequence similar to the Fibbonacci sequnece that's defined as follows:\n fib4(0) -> 0\n fib4(1) -> 0\n fib4(2) -> 2\n fib4(3) -> 0\n fib4(n) -> fib4(n-1) + fib4(n-2) + fib4(n-3) + fib4(n-4).\n Please write a function to efficiently compute the n-th element of the fib4 number sequence. Do not use recursion.\n >>> fib4(5)\n 4\n >>> fib4(6)\n 8\n >>> fib4(7)\n 14\n \"\"\"\n", "canonical_solution": " results = [0, 0, 2, 0]\n if n < 4:\n return results[n]\n\n for _ in range(4, n + 1):\n results.append(results[-1] + results[-2] + results[-3] + results[-4])\n results.pop(0)\n\n return results[-1]\n"} +{"metadata": {"entry_point": "max_fill", "test": "def check(candidate):\n\n\n # Check some simple cases\n assert True, \"This prints if this assert fails 1 (good for debugging!)\"\n assert candidate([[0,0,1,0], [0,1,0,0], [1,1,1,1]], 1) == 6, \"Error\"\n assert candidate([[0,0,1,1], [0,0,0,0], [1,1,1,1], [0,1,1,1]], 2) == 5, \"Error\"\n assert candidate([[0,0,0], [0,0,0]], 5) == 0, \"Error\"\n\n # Check some edge cases that are easy to work out by hand.\n assert True, \"This prints if this assert fails 2 (also good for debugging!)\"\n assert candidate([[1,1,1,1], [1,1,1,1]], 2) == 4, \"Error\"\n assert candidate([[1,1,1,1], [1,1,1,1]], 9) == 2, \"Error\"\n\n"}, "task_id": "HumanEval/115", "prompt": "\ndef max_fill(grid, capacity):\n import math\n \"\"\"\n You are given a rectangular grid of wells. 
Each row represents a single well,\n and each 1 in a row represents a single unit of water.\n Each well has a corresponding bucket that can be used to extract water from it, \n and all buckets have the same capacity.\n Your task is to use the buckets to empty the wells.\n Output the number of times you need to lower the buckets.\n\n Example 1:\n Input: \n grid : [[0,0,1,0], [0,1,0,0], [1,1,1,1]]\n bucket_capacity : 1\n Output: 6\n\n Example 2:\n Input: \n grid : [[0,0,1,1], [0,0,0,0], [1,1,1,1], [0,1,1,1]]\n bucket_capacity : 2\n Output: 5\n \n Example 3:\n Input: \n grid : [[0,0,0], [0,0,0]]\n bucket_capacity : 5\n Output: 0\n\n Constraints:\n * all wells have the same length\n * 1 <= grid.length <= 10^2\n * 1 <= grid[:,1].length <= 10^2\n * grid[i][j] -> 0 | 1\n * 1 <= capacity <= 10\n \"\"\"\n", "canonical_solution": " return sum([math.ceil(sum(arr)/capacity) for arr in grid])\n"} +{"metadata": {"entry_point": "separate_paren_groups", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate('(()()) ((())) () ((())()())') == [\n '(()())', '((()))', '()', '((())()())'\n ]\n assert candidate('() (()) ((())) (((())))') == [\n '()', '(())', '((()))', '(((())))'\n ]\n assert candidate('(()(())((())))') == [\n '(()(())((())))'\n ]\n assert candidate('( ) (( )) (( )( ))') == ['()', '(())', '(()())']\n"}, "task_id": "HumanEval/1", "prompt": "from typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n", "canonical_solution": " result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == '(':\n current_depth += 1\n current_string.append(c)\n elif c == ')':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(''.join(current_string))\n current_string.clear()\n\n return result\n"} +{"metadata": {"entry_point": "filter_integers", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([]) == []\n assert candidate([4, {}, [], 23.2, 9, 'adasd']) == [4, 9]\n assert candidate([3, 'c', 3, 3, 'a', 'b']) == [3, 3, 3]\n"}, "task_id": "HumanEval/22", "prompt": "from typing import List, Any\n\n\ndef filter_integers(values: List[Any]) -> List[int]:\n \"\"\" Filter given list of any python values only for integers\n >>> filter_integers(['a', 3.14, 5])\n [5]\n >>> filter_integers([1, 2, 3, 'abc', {}, []])\n [1, 2, 3]\n \"\"\"\n", "canonical_solution": " return [x for x in values if isinstance(x, int)]\n"} +{"metadata": {"entry_point": "encode", "test": "def check(candidate):\n\n # Check some simple cases\n assert candidate('TEST') == 'tgst', \"This prints if this assert fails 1 (good for debugging!)\"\n assert candidate('Mudasir') == 'mWDCSKR', \"This prints if this assert fails 2 (good for debugging!)\"\n assert candidate('YES') == 'ygs', \"This prints if this assert fails 3 (good for debugging!)\"\n \n # Check some edge cases that are easy to work out by hand.\n assert candidate('This is a message') == 'tHKS KS C MGSSCGG', \"This prints if this assert fails 2 (also good for debugging!)\"\n assert 
candidate(\"I DoNt KnOw WhAt tO WrItE\") == 'k dQnT kNqW wHcT Tq wRkTg', \"This prints if this assert fails 2 (also good for debugging!)\"\n\n"}, "task_id": "HumanEval/93", "prompt": "\ndef encode(message):\n \"\"\"\n Write a function that takes a message, and encodes in such a \n way that it swaps case of all letters, replaces all vowels in \n the message with the letter that appears 2 places ahead of that \n vowel in the english alphabet. \n Assume only letters. \n \n Examples:\n >>> encode('test')\n 'TGST'\n >>> encode('This is a message')\n 'tHKS KS C MGSSCGG'\n \"\"\"\n", "canonical_solution": " vowels = \"aeiouAEIOU\"\n vowels_replace = dict([(i, chr(ord(i) + 2)) for i in vowels])\n message = message.swapcase()\n return ''.join([vowels_replace[i] if i in vowels else i for i in message])\n"} +{"metadata": {"entry_point": "valid_date", "test": "def check(candidate):\n\n # Check some simple cases\n assert candidate('03-11-2000') == True\n\n assert candidate('15-01-2012') == False\n\n assert candidate('04-0-2040') == False\n\n assert candidate('06-04-2020') == True\n\n assert candidate('01-01-2007') == True\n\n assert candidate('03-32-2011') == False\n\n assert candidate('') == False\n\n assert candidate('04-31-3000') == False\n\n assert candidate('06-06-2005') == True\n\n assert candidate('21-31-2000') == False\n\n assert candidate('04-12-2003') == True\n\n assert candidate('04122003') == False\n\n assert candidate('20030412') == False\n\n assert candidate('2003-04') == False\n\n assert candidate('2003-04-12') == False\n\n assert candidate('04-2003') == False\n"}, "task_id": "HumanEval/124", "prompt": "\ndef valid_date(date):\n \"\"\"You have to write a function which validates a given date string and\n returns True if the date is valid otherwise False.\n The date is valid if all of the following rules are satisfied:\n 1. The date string is not empty.\n 2. The number of days is not less than 1 or higher than 31 days for months 1,3,5,7,8,10,12. And the number of days is not less than 1 or higher than 30 days for months 4,6,9,11. And, the number of days is not less than 1 or higher than 29 for the month 2.\n 3. The months should not be less than 1 or higher than 12.\n 4. 
The date should be in the format: mm-dd-yyyy\n\n for example: \n valid_date('03-11-2000') => True\n\n valid_date('15-01-2012') => False\n\n valid_date('04-0-2040') => False\n\n valid_date('06-04-2020') => True\n\n valid_date('06/04/2020') => False\n \"\"\"\n", "canonical_solution": " try:\n date = date.strip()\n month, day, year = date.split('-')\n month, day, year = int(month), int(day), int(year)\n if month < 1 or month > 12:\n return False\n if month in [1,3,5,7,8,10,12] and day < 1 or day > 31:\n return False\n if month in [4,6,9,11] and day < 1 or day > 30:\n return False\n if month == 2 and day < 1 or day > 29:\n return False\n except:\n return False\n\n return True\n"} +{"metadata": {"entry_point": "rounded_avg", "test": "def check(candidate):\n\n # Check some simple cases\n assert candidate(1, 5) == \"0b11\"\n assert candidate(7, 13) == \"0b1010\"\n assert candidate(964,977) == \"0b1111001010\"\n assert candidate(996,997) == \"0b1111100100\"\n assert candidate(560,851) == \"0b1011000010\"\n assert candidate(185,546) == \"0b101101110\"\n assert candidate(362,496) == \"0b110101101\"\n assert candidate(350,902) == \"0b1001110010\"\n assert candidate(197,233) == \"0b11010111\"\n\n\n # Check some edge cases that are easy to work out by hand.\n assert candidate(7, 5) == -1\n assert candidate(5, 1) == -1\n assert candidate(5, 5) == \"0b101\"\n\n"}, "task_id": "HumanEval/103", "prompt": "\ndef rounded_avg(n, m):\n \"\"\"You are given two positive integers n and m, and your task is to compute the\n average of the integers from n through m (including n and m). \n Round the answer to the nearest integer and convert that to binary.\n If n is greater than m, return -1.\n Example:\n rounded_avg(1, 5) => \"0b11\"\n rounded_avg(7, 5) => -1\n rounded_avg(10, 20) => \"0b1111\"\n rounded_avg(20, 33) => \"0b11010\"\n \"\"\"\n", "canonical_solution": " if m < n:\n return -1\n summation = 0\n for i in range(n, m+1):\n summation += i\n return bin(round(summation/(m - n + 1)))\n"} +{"metadata": {"entry_point": "Strongest_Extension", "test": "def check(candidate):\n\n # Check some simple cases\n assert candidate('Watashi', ['tEN', 'niNE', 'eIGHt8OKe']) == 'Watashi.eIGHt8OKe'\n assert candidate('Boku123', ['nani', 'NazeDa', 'YEs.WeCaNe', '32145tggg']) == 'Boku123.YEs.WeCaNe'\n assert candidate('__YESIMHERE', ['t', 'eMptY', 'nothing', 'zeR00', 'NuLl__', '123NoooneB321']) == '__YESIMHERE.NuLl__'\n assert candidate('K', ['Ta', 'TAR', 't234An', 'cosSo']) == 'K.TAR'\n assert candidate('__HAHA', ['Tab', '123', '781345', '-_-']) == '__HAHA.123'\n assert candidate('YameRore', ['HhAas', 'okIWILL123', 'WorkOut', 'Fails', '-_-']) == 'YameRore.okIWILL123'\n assert candidate('finNNalLLly', ['Die', 'NowW', 'Wow', 'WoW']) == 'finNNalLLly.WoW'\n\n # Check some edge cases that are easy to work out by hand.\n assert candidate('_', ['Bb', '91245']) == '_.Bb'\n assert candidate('Sp', ['671235', 'Bb']) == 'Sp.671235'\n \n"}, "task_id": "HumanEval/153", "prompt": "\ndef Strongest_Extension(class_name, extensions):\n \"\"\"You will be given the name of a class (a string) and a list of extensions.\n The extensions are to be used to load additional classes to the class. The\n strength of the extension is as follows: Let CAP be the number of the uppercase\n letters in the extension's name, and let SM be the number of lowercase letters \n in the extension's name, the strength is given by the fraction CAP - SM. 
\n You should find the strongest extension and return a string in this \n format: ClassName.StrongestExtensionName.\n If there are two or more extensions with the same strength, you should\n choose the one that comes first in the list.\n For example, if you are given \"Slices\" as the class and a list of the\n extensions: ['SErviNGSliCes', 'Cheese', 'StuFfed'] then you should\n return 'Slices.SErviNGSliCes' since 'SErviNGSliCes' is the strongest extension \n (its strength is -1).\n Example:\n for Strongest_Extension('my_class', ['AA', 'Be', 'CC']) == 'my_class.AA'\n \"\"\"\n", "canonical_solution": " strong = extensions[0]\n my_val = len([x for x in extensions[0] if x.isalpha() and x.isupper()]) - len([x for x in extensions[0] if x.isalpha() and x.islower()])\n for s in extensions:\n val = len([x for x in s if x.isalpha() and x.isupper()]) - len([x for x in s if x.isalpha() and x.islower()])\n if val > my_val:\n strong = s\n my_val = val\n\n ans = class_name + \".\" + strong\n return ans\n\n"} diff --git a/requirements.txt b/requirements.txt index 859a485f..aedcb879 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -transformers ~= 4.26.1 +transformers @ git+https://github.com/huggingface/transformers@11fd2c773b11c3fcfe0fa25aa4b92db03c83636c tree-sitter ~= 0.19.0 torch ~= 1.12.0 pytorch-lightning == 1.7.4
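The sketch below (not part of the patch) illustrates how the added pieces are intended to fit together: `preprocessing/preprocess_humaneval.py` writes `data/humaneval/humaneval.jsonl`, and `HumanEvalExecutor.real_exec_program` scores a completion against the HumanEval tests via human-eval's `check_correctness`. It is a minimal, hypothetical example: the file path comes from the config above, the canonical solution stands in for a model completion, and it assumes human-eval is installed with its sandboxed `exec` line enabled as described in that repository's warning.

```python
# Minimal sketch (assumptions noted above): score one HumanEval problem with the
# executor added in this diff. Run preprocessing/preprocess_humaneval.py first so
# that data/humaneval/humaneval.jsonl exists.
import json

from execution.executors import HumanEvalExecutor  # class introduced in this patch

with open("data/humaneval/humaneval.jsonl") as f:
    # each line holds {"metadata": {...}, "prompt": ..., "canonical_solution": ...}
    example = json.loads(f.readline())

# Use the canonical solution as a stand-in completion; a real evaluation would pass
# the post-processed model output (see HumanEvalExecutor.process_output) instead.
completion = example["canonical_solution"]

exec_match, exec_result = HumanEvalExecutor.real_exec_program(completion, example)
print(exec_match, exec_result)  # should report a pass ("passed") for the canonical solution
```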