From 529edceb3a0c449191f5d47353cd866049e941ca Mon Sep 17 00:00:00 2001 From: yilunzhao Date: Tue, 25 Apr 2023 16:48:37 -0400 Subject: [PATCH 01/18] re-upload the implementation for llama, alpaca, santacoder --- README.md | 5 + execution/executors.py | 6 +- .../lightning_modules/models/seq2seq_model.py | 4 + .../models/seq2seq_model_util.py | 25 ++++- .../mathqa-8_fixed_mathqa_shots_llama.yaml | 99 +++++++++++++++++++ 5 files changed, 137 insertions(+), 2 deletions(-) create mode 100644 finetuning/training_configs/few_shot/mathqa-8_fixed_mathqa_shots_llama.yaml diff --git a/README.md b/README.md index 27fad5e1..fc35d15c 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,11 @@ pip install -r requirements.txt export PYTHONPATH=`pwd` ``` +To run LLAMA-based model, you need to install the development version of `transformers` library: +```bash +pip install git+https://github.com/huggingface/transformers +``` + ## Wandb We use Wandb for experiment tracking. Please register ask Ansong for an invitation to the Wandb Yale-LILY team before running experiments. When you are ready to run the exps and log it to the cloud, do the following: diff --git a/execution/executors.py b/execution/executors.py index 92c5c15d..a3d9229c 100644 --- a/execution/executors.py +++ b/execution/executors.py @@ -314,7 +314,11 @@ def gold_program_len(self, example: Dict[str, Any]) -> int: @overrides def process_output(self, output: str, tokenizer_eos_token: str) -> str: - return output.lstrip().split(tokenizer_eos_token)[0].split("\n\n")[0].strip() + if not tokenizer_eos_token: + # for llama-based model + return output.lstrip().split("\n\n")[0].strip() + else: + return output.lstrip().split(tokenizer_eos_token)[0].split("\n\n")[0].strip() @overrides def exec_result_eq(self, program_dict_1: Dict[str, Any], program_dict_2: Dict[str, Any]) -> bool: diff --git a/finetuning/lightning_modules/models/seq2seq_model.py b/finetuning/lightning_modules/models/seq2seq_model.py index 9517b1bb..4dcbb0df 100644 --- a/finetuning/lightning_modules/models/seq2seq_model.py +++ b/finetuning/lightning_modules/models/seq2seq_model.py @@ -149,6 +149,10 @@ def generate_and_post_process(self, num_beam = 1 temp = temperature + # https://github.com/THUDM/ChatGLM-6B/issues/31 + if "santacoder" in self.transformer_model_name: + use_sample = False + generation_results = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, do_sample=use_sample, max_new_tokens=self.max_gen_len, num_beams=num_beam, temperature=temp, num_return_sequences=num_return_sequences, diff --git a/finetuning/lightning_modules/models/seq2seq_model_util.py b/finetuning/lightning_modules/models/seq2seq_model_util.py index 4397b150..fa443aa5 100755 --- a/finetuning/lightning_modules/models/seq2seq_model_util.py +++ b/finetuning/lightning_modules/models/seq2seq_model_util.py @@ -13,6 +13,7 @@ from transformers import CodeGenTokenizer, CodeGenForCausalLM, T5Tokenizer from transformers import BartTokenizer, BartModel, BartForConditionalGeneration from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification +from transformers import LlamaTokenizer, LlamaForCausalLM from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizerFast @@ -138,7 +139,29 @@ def get_model(model_name: str, if not tokenizer_only: model = BartForSequenceClassification.from_pretrained(model_name, num_labels=2) - + elif "llama" in model_name.lower() or "alpaca" in model_name.lower(): + tokenizer = LlamaTokenizer.from_pretrained(model_name, + 
additional_special_tokens=additional_special_tokens) + tokenizer.pad_token = tokenizer.eos_token + + if not tokenizer_only: + model = LlamaForCausalLM.from_pretrained(model_name, + pad_token_id=tokenizer.eos_token_id, + torch_dtype=torch.float16) + if len(additional_special_tokens) > 0: + model.resize_token_embeddings(len(tokenizer)) + elif "santacoder" in model_name: + tokenizer = AutoTokenizer.from_pretrained(model_name, + additional_special_tokens=additional_special_tokens) + tokenizer.pad_token = tokenizer.eos_token + + if not tokenizer_only: + model = AutoModelForCausalLM.from_pretrained(model_name, + pad_token_id=tokenizer.eos_token_id, + torch_dtype=torch.float16, + trust_remote_code=True) + if len(additional_special_tokens) > 0: + model.resize_token_embeddings(len(tokenizer)) elif model_name.startswith("openai/"): engine = model_name.split("/")[-1] diff --git a/finetuning/training_configs/few_shot/mathqa-8_fixed_mathqa_shots_llama.yaml b/finetuning/training_configs/few_shot/mathqa-8_fixed_mathqa_shots_llama.yaml new file mode 100644 index 00000000..e8081cde --- /dev/null +++ b/finetuning/training_configs/few_shot/mathqa-8_fixed_mathqa_shots_llama.yaml @@ -0,0 +1,99 @@ +seed_everything: 333 +trainer: + gpus: 2,3 + gradient_clip_val: 1.0 + default_root_dir: &exp_name results/debug-tmp-128 + # default_root_dir: &exp_name results/gsmath-incoder_6b-few_shot-pass_at_50-train-output_prob-gen_len_256-gsm_shots-split_0 + val_check_interval: 1.0 + max_steps: &max_steps 25000 + # progress_bar_refresh_rate: 1 + num_sanity_val_steps: 0 + log_every_n_steps: 1 + logger+: + - class_path: finetuning.lightning_modules.patches.patched_loggers.PatchedWandbLogger + init_args: + entity: yilunzhao + project: unified-codegen + save_dir: *exp_name + name: *exp_name + log_model: False + save_code: True + offline: False + # offline: True + callbacks+: + - class_path: pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint + init_args: + # monitor: exec_acc + monitor: pass@100 + mode: max + # filename: '{step}-{exec_acc:.4f}-{exec_rate:.4f}' + filename: '{step}-{pass@100:.4f}-{exec_acc:.4f}' + save_top_k: 3 + save_last: True + - class_path: pytorch_lightning.callbacks.LearningRateMonitor + init_args: + logging_interval: step + - class_path: pytorch_lightning.callbacks.progress.TQDMProgressBar + init_args: + refresh_rate: 1 + # - class_path: pytorch_lightning.callbacks.gpu_stats_monitor.GPUStatsMonitor + # init_args: + # memory_utilization: true + # gpu_utilization: true + + accelerator: gpu + # replace_sampler_ddp: False + # https://github.com/PyTorchLightning/pytorch-lightning/issues/8262 + # strategy: deepspeed_stage_2 + strategy: deepspeed_stage_3_offload + # strategy: ddp_find_unused_parameters_false + precision: 16 + # accumulate_grad_batches: 2 + +model: + class_path: lightning_modules.models.seq2seq_model.Seq2SeqModel + init_args: + transformer_model_name: &transformer decapoda-research/llama-7b-hf + executor_cls: execution.executors.MathExecutor + max_gen_len: 256 + sampling_temp: 0.001 + # sampling_temp_at_k: 0.8 + # pass_at_k: 50 + # load_ckpt_file: results/squall-t5_large-finetuning-cat_eval-pass_100_eval-max_pass_at_k/cot-codegen/2vx58eip/checkpoints/step=534-pass@100=0.8419-exec_acc=0.5593.ckpt + # eval_pass_at_k_every_n_epochs: 1 + # max_generation_batches: 5 + # gradient_ckpt: true + # eval_greedy_search: true + # save_raw_generation_results: true + # print_eval_every_n_batches: 1 + optimizer: + init_args: + # lr: 5.0e-5 + lr: 0.0 + betas: + - 0.9 + - 0.999 + eps: 1.0e-8 + weight_decay: 
0.1 + lr_scheduler: + name: linear + init_args: + num_warmup_steps: 100 + num_training_steps: *max_steps + +data: + class_path: lightning_modules.datasets.mathqa_reader.FewShotMathQADataModule + init_args: + transformer_model_name: *transformer + batch_size: 1 + val_batch_size: 1 + # train_max_instances: 100 + val_max_instances: 64 + # train_set_init_args: + # file_path: data/squall/squall_train_processed.jsonl + val_set_init_args: + # prompt_examples: 4 + file_path: data/mathqa/val_dedup_init_val.jsonl + prompt_file: prompt_files/mathqa_non_idiomatic_code_init_val.txt + +# clear; export PYTHONPATH=`pwd`; python finetuning/new_trainer.py validate --config finetuning/training_configs/few_shot/mathqa-8_fixed_mathqa_shots.yaml \ No newline at end of file From 9dbc8cd46e6d35b04f89617d998328bed63eaa05 Mon Sep 17 00:00:00 2001 From: yilunzhao Date: Tue, 25 Apr 2023 21:32:18 -0400 Subject: [PATCH 02/18] try to pass CI check --- .../mathqa-8_fixed_mathqa_shots_llama.yaml | 35 ++++++++++--------- requirements.txt | 2 +- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/finetuning/training_configs/few_shot/mathqa-8_fixed_mathqa_shots_llama.yaml b/finetuning/training_configs/few_shot/mathqa-8_fixed_mathqa_shots_llama.yaml index e8081cde..c93de965 100644 --- a/finetuning/training_configs/few_shot/mathqa-8_fixed_mathqa_shots_llama.yaml +++ b/finetuning/training_configs/few_shot/mathqa-8_fixed_mathqa_shots_llama.yaml @@ -1,6 +1,6 @@ seed_everything: 333 trainer: - gpus: 2,3 + gpus: 2 gradient_clip_val: 1.0 default_root_dir: &exp_name results/debug-tmp-128 # default_root_dir: &exp_name results/gsmath-incoder_6b-few_shot-pass_at_50-train-output_prob-gen_len_256-gsm_shots-split_0 @@ -45,8 +45,8 @@ trainer: # replace_sampler_ddp: False # https://github.com/PyTorchLightning/pytorch-lightning/issues/8262 # strategy: deepspeed_stage_2 - strategy: deepspeed_stage_3_offload - # strategy: ddp_find_unused_parameters_false + # strategy: deepspeed_stage_2 + strategy: ddp_find_unused_parameters_false precision: 16 # accumulate_grad_batches: 2 @@ -59,12 +59,9 @@ model: sampling_temp: 0.001 # sampling_temp_at_k: 0.8 # pass_at_k: 50 - # load_ckpt_file: results/squall-t5_large-finetuning-cat_eval-pass_100_eval-max_pass_at_k/cot-codegen/2vx58eip/checkpoints/step=534-pass@100=0.8419-exec_acc=0.5593.ckpt - # eval_pass_at_k_every_n_epochs: 1 # max_generation_batches: 5 - # gradient_ckpt: true - # eval_greedy_search: true - # save_raw_generation_results: true + gradient_ckpt: false + save_raw_generation_results: true # print_eval_every_n_batches: 1 optimizer: init_args: @@ -82,18 +79,22 @@ model: num_training_steps: *max_steps data: - class_path: lightning_modules.datasets.mathqa_reader.FewShotMathQADataModule + class_path: lightning_modules.datasets.base_datamodule.FewShotNL2CodeDataModule init_args: transformer_model_name: *transformer + dataset_cls: FewShotMathQADataset batch_size: 1 - val_batch_size: 1 - # train_max_instances: 100 - val_max_instances: 64 - # train_set_init_args: - # file_path: data/squall/squall_train_processed.jsonl + val_batch_size: 4 + ## prompting settings + prompting_init_args: + exemplar_file_path: prompt_files/mathqa-non_idiomatic_code-annotated-8_exemplars.jsonl + num_exemplars: 8 + fixed_exemplars: true + exemplar_selection_method: first + add_instruction: true + use_chat_format: false + # val_max_instances: 64 val_set_init_args: - # prompt_examples: 4 file_path: data/mathqa/val_dedup_init_val.jsonl - prompt_file: prompt_files/mathqa_non_idiomatic_code_init_val.txt -# clear; export 
PYTHONPATH=`pwd`; python finetuning/new_trainer.py validate --config finetuning/training_configs/few_shot/mathqa-8_fixed_mathqa_shots.yaml \ No newline at end of file +# clear; export PYTHONPATH=`pwd`; python finetuning/new_trainer.py validate --config finetuning/training_configs/few_shot/mathqa-8_fixed_mathqa_shots_llama.yaml \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 859a485f..aedcb879 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -transformers ~= 4.26.1 +transformers @ git+https://github.com/huggingface/transformers@11fd2c773b11c3fcfe0fa25aa4b92db03c83636c tree-sitter ~= 0.19.0 torch ~= 1.12.0 pytorch-lightning == 1.7.4 From ce36f13a7ac9bdc72afcb5477a70cf1a1b0df016 Mon Sep 17 00:00:00 2001 From: yilunzhao Date: Thu, 27 Apr 2023 16:39:28 -0400 Subject: [PATCH 03/18] modify process_output function in exectutor.py to handle the case that llama-based model uses empty string as tokenizer_eos_token --- execution/executors.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/execution/executors.py b/execution/executors.py index a3d9229c..144cf887 100644 --- a/execution/executors.py +++ b/execution/executors.py @@ -168,7 +168,11 @@ def gold_program_len(self, example: Dict[str, Any]) -> int: @overrides def process_output(self, output: str, tokenizer_eos_token: str) -> str: - return output.lstrip().split(tokenizer_eos_token)[0].split("\n\n")[0].split(";")[0].strip() + if not tokenizer_eos_token: + # for llama-based model + return output.lstrip().split()[0].split("\n\n")[0].split(";")[0].strip() + else: + return output.lstrip().split(tokenizer_eos_token)[0].split("\n\n")[0].split(";")[0].strip() @overrides def exec_result_eq(self, program_dict_1: Dict[str, Any], program_dict_2: Dict[str, Any]) -> bool: From 60fd89bc827340d081026dc34658b5d6202c1f1c Mon Sep 17 00:00:00 2001 From: yilunzhao Date: Fri, 5 May 2023 16:03:47 -0400 Subject: [PATCH 04/18] fix error related to llama eos_token --- execution/executors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/execution/executors.py b/execution/executors.py index 144cf887..e290b395 100644 --- a/execution/executors.py +++ b/execution/executors.py @@ -170,7 +170,7 @@ def gold_program_len(self, example: Dict[str, Any]) -> int: def process_output(self, output: str, tokenizer_eos_token: str) -> str: if not tokenizer_eos_token: # for llama-based model - return output.lstrip().split()[0].split("\n\n")[0].split(";")[0].strip() + return output.lstrip().split("\n\n")[0].split(";")[0].strip() else: return output.lstrip().split(tokenizer_eos_token)[0].split("\n\n")[0].split(";")[0].strip() From 83546d97a51edf7d0951badc08408f6d0f8687ff Mon Sep 17 00:00:00 2001 From: yilunzhao Date: Sat, 6 May 2023 10:53:49 -0400 Subject: [PATCH 05/18] add starcoder, gpt-neox-20b; test gpt-j-6b --- finetuning/lightning_modules/models/seq2seq_model.py | 2 +- .../lightning_modules/models/seq2seq_model_util.py | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/finetuning/lightning_modules/models/seq2seq_model.py b/finetuning/lightning_modules/models/seq2seq_model.py index 97f327b9..26fd2cef 100644 --- a/finetuning/lightning_modules/models/seq2seq_model.py +++ b/finetuning/lightning_modules/models/seq2seq_model.py @@ -156,7 +156,7 @@ def generate_and_post_process(self, temp = temperature # https://github.com/THUDM/ChatGLM-6B/issues/31 - if "santacoder" in self.transformer_model_name: + if "santacoder" in self.transformer_model_name or "gpt-neox-20b" in 
self.transformer_model_name:
             use_sample = False
 
         generation_results = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, do_sample=use_sample,
diff --git a/finetuning/lightning_modules/models/seq2seq_model_util.py b/finetuning/lightning_modules/models/seq2seq_model_util.py
index 41623601..996b9b2c 100755
--- a/finetuning/lightning_modules/models/seq2seq_model_util.py
+++ b/finetuning/lightning_modules/models/seq2seq_model_util.py
@@ -56,7 +56,7 @@ def get_model(model_name: str,
             model = GPT2LMHeadModel.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id)
             if len(additional_special_tokens) > 0:
                 model.resize_token_embeddings(len(tokenizer))
-    elif model_name == "EleutherAI/gpt-j-6B":
+    elif model_name == "EleutherAI/gpt-j-6b":
         tokenizer = GPT2Tokenizer.from_pretrained(model_name)
         tokenizer.pad_token = tokenizer.eos_token
 
@@ -65,6 +65,14 @@ def get_model(model_name: str,
                 gradient_checkpointing=gradient_ckpt, use_cache=not gradient_ckpt)
             if len(additional_special_tokens) > 0:
                 model.resize_token_embeddings(len(tokenizer))
+    elif model_name == "EleutherAI/gpt-neox-20b":
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        tokenizer.pad_token = tokenizer.eos_token
+
+        if not tokenizer_only:
+            model = AutoModelForCausalLM.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id)
+            if len(additional_special_tokens) > 0:
+                model.resize_token_embeddings(len(tokenizer))
     elif model_name in ["EleutherAI/gpt-neo-1.3B", "EleutherAI/gpt-neo-125M", "EleutherAI/gpt-neo-2.7B"]:
         tokenizer = GPT2Tokenizer.from_pretrained(model_name, additional_special_tokens=additional_special_tokens)
         tokenizer.pad_token = tokenizer.eos_token
@@ -148,7 +156,7 @@ def get_model(model_name: str,
                 torch_dtype=torch.float16)
             if len(additional_special_tokens) > 0:
                 model.resize_token_embeddings(len(tokenizer))
-    elif "santacoder" in model_name:
+    elif model_name in ["bigcode/starcoder", "bigcode/santacoder"]:
         tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                   additional_special_tokens=additional_special_tokens)
         tokenizer.pad_token = tokenizer.eos_token

From 86d3bd57ea3691eddfabdbf4a56615edcb0d17d5 Mon Sep 17 00:00:00 2001
From: yilunzhao
Date: Fri, 19 May 2023 16:24:36 -0400
Subject: [PATCH 06/18] archive code for NeurIPS exps; add gpt-4, pythia,
 replit, dolly, starchat, etc

---
 .../lightning_modules/models/openai_model.py  |  8 ++---
 .../lightning_modules/models/seq2seq_model.py |  6 ++--
 .../models/seq2seq_model_util.py              | 32 +++++++++++++++++--
 3 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/finetuning/lightning_modules/models/openai_model.py b/finetuning/lightning_modules/models/openai_model.py
index 8c8e7cf3..000322ae 100644
--- a/finetuning/lightning_modules/models/openai_model.py
+++ b/finetuning/lightning_modules/models/openai_model.py
@@ -80,10 +80,10 @@ def prune_none_args(**kwargs):
                                     temperature=temperature, top_p=top_p, n=n, best_of=best_of,
                                     stop=stop, **kwargs)
 
-    if engine.startswith("gpt-3.5-turbo"):
+    if 
engine.startswith("gpt-3.5-turbo"): + if engine.startswith("gpt-3.5-turbo") or engine.startswith("gpt-4"): completion_texts = [x['message']['content'] for x in completion.choices] else: completion_texts = [x.text for x in completion.choices] @@ -141,7 +141,7 @@ def __init__(self, ) -> None: SUPPORTED_OPENAI_MODELS = ["code-davinci-002", "code-cushman-002", "code-cushman-001", "code-davinci-001", - "gpt-3.5-turbo"] + "gpt-3.5-turbo", "text-davinci-003", "text-davinci-002","gpt-4"] assert engine in SUPPORTED_OPENAI_MODELS, f"OpenAIModel only supports {SUPPORTED_OPENAI_MODELS}" self.engine = engine diff --git a/finetuning/lightning_modules/models/seq2seq_model.py b/finetuning/lightning_modules/models/seq2seq_model.py index 26fd2cef..885724f4 100644 --- a/finetuning/lightning_modules/models/seq2seq_model.py +++ b/finetuning/lightning_modules/models/seq2seq_model.py @@ -75,9 +75,9 @@ def __init__(self, # We only instantiate this when we need it. self.transformer_model_name = transformer_model_name if "openai" in self.transformer_model_name: - if self.transformer_model_name.startswith("openai/gpt-3.5-turbo"): + if self.transformer_model_name.startswith("openai/gpt-3.5-turbo") or self.transformer_model_name.startswith("openai/gpt-4"): if self.save_raw_generation_results: - print("get_raw_generation_results is not supported for gpt-3.5-turbo, set to False instead") + print("get_raw_generation_results is not supported for gpt-3.5-turbo and gpt-4, set to False instead") self.save_raw_generation_results = False transformer_model_init_args["save_raw_generation_results"] = self.save_raw_generation_results transformer_model_init_args["use_chat_format"] = self.use_chat_format @@ -156,7 +156,7 @@ def generate_and_post_process(self, temp = temperature # https://github.com/THUDM/ChatGLM-6B/issues/31 - if "santacoder" in self.transformer_model_name or "gpt-neox-20b" in self.transformer_model_name: + if "santacoder" in self.transformer_model_name or "gpt-neox-20b" in self.transformer_model_name or "replit" in self.transformer_model_name: use_sample = False generation_results = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, do_sample=use_sample, diff --git a/finetuning/lightning_modules/models/seq2seq_model_util.py b/finetuning/lightning_modules/models/seq2seq_model_util.py index 996b9b2c..ad13c97b 100755 --- a/finetuning/lightning_modules/models/seq2seq_model_util.py +++ b/finetuning/lightning_modules/models/seq2seq_model_util.py @@ -65,7 +65,7 @@ def get_model(model_name: str, gradient_checkpointing=gradient_ckpt, use_cache=not gradient_ckpt) if len(additional_special_tokens) > 0: model.resize_token_embeddings(len(tokenizer)) - elif model_name == "EleutherAI/gpt-neox-20b": + elif model_name in ["EleutherAI/gpt-neox-20b", "EleutherAI/pythia-1.4b-deduped", "EleutherAI/pythia-6.9b-deduped", "EleutherAI/pythia-12b-deduped", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"]: tokenizer = AutoTokenizer.from_pretrained(model_name) tokenizer.pad_token = tokenizer.eos_token @@ -156,11 +156,37 @@ def get_model(model_name: str, torch_dtype=torch.float16) if len(additional_special_tokens) > 0: model.resize_token_embeddings(len(tokenizer)) - elif model_name in ["bigcode/starcoder", "bigcode/santacoder"]: + elif model_name == "bigcode/santacoder": tokenizer = AutoTokenizer.from_pretrained(model_name, additional_special_tokens=additional_special_tokens) tokenizer.pad_token = tokenizer.eos_token + if not tokenizer_only: + model = AutoModelForCausalLM.from_pretrained(model_name, + 
pad_token_id=tokenizer.eos_token_id, + torch_dtype=torch.float32, + trust_remote_code=True, + ) + if len(additional_special_tokens) > 0: + model.resize_token_embeddings(len(tokenizer)) + elif model_name in ["bigcode/starcoder", "HuggingFaceH4/starchat-alpha"]: + tokenizer = AutoTokenizer.from_pretrained(model_name, + additional_special_tokens=additional_special_tokens) + tokenizer.pad_token = tokenizer.eos_token + + if not tokenizer_only: + model = AutoModelForCausalLM.from_pretrained(model_name, + pad_token_id=tokenizer.eos_token_id, + torch_dtype=torch.float16, + trust_remote_code=True) + if len(additional_special_tokens) > 0: + model.resize_token_embeddings(len(tokenizer)) + elif model_name == "replit/replit-code-v1-3b": + tokenizer = AutoTokenizer.from_pretrained(model_name, + additional_special_tokens=additional_special_tokens, + trust_remote_code=True) + tokenizer.pad_token = tokenizer.eos_token + if not tokenizer_only: model = AutoModelForCausalLM.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id, @@ -175,7 +201,7 @@ def get_model(model_name: str, tokenizer.pad_token = tokenizer.eos_token # to accomandate the length of openai models and the prompt - if engine in ["code-davinci-002"]: + if engine in ["code-davinci-002", "gpt-4"]: model_length = 8001 elif engine in ["code-cushman-001", "code-cushman-002"]: model_length = 1024 From 1fd0828c436ace73b7d3ed52132efc261710da0c Mon Sep 17 00:00:00 2001 From: rays1024 Date: Sun, 9 Jul 2023 15:27:21 -0400 Subject: [PATCH 07/18] merged with llm_implementation --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 2fc06ea7..2b10e219 100644 --- a/.gitignore +++ b/.gitignore @@ -135,3 +135,7 @@ debug-tmp/ wandb/ results/ .vscode/ + +# defined by rui +DS_1000/ +.DS_Store From a80826e2a02d6526db5620d61eabebfb08ed8225 Mon Sep 17 00:00:00 2001 From: rays1024 Date: Tue, 11 Jul 2023 11:41:00 -0400 Subject: [PATCH 08/18] humaneval now works on gpt-neo-125M --- execution/executors.py | 43 ++++++++++++- .../datasets/base_datamodule.py | 1 + .../datasets/humaneval_reader.py | 35 +++++++++++ .../training_configs/few_shot/humaneval.yaml | 62 +++++++++++++++++++ preprocessing/preprocess_humaneval.py | 34 ++++++++++ prompt_files/humaneval-8_exemplars.jsonl | 8 +++ 6 files changed, 182 insertions(+), 1 deletion(-) create mode 100644 finetuning/lightning_modules/datasets/humaneval_reader.py create mode 100644 finetuning/training_configs/few_shot/humaneval.yaml create mode 100644 preprocessing/preprocess_humaneval.py create mode 100644 prompt_files/humaneval-8_exemplars.jsonl diff --git a/execution/executors.py b/execution/executors.py index e290b395..23ecea52 100644 --- a/execution/executors.py +++ b/execution/executors.py @@ -14,6 +14,9 @@ from execution.safe_execution_util import execute from execution.program_tracing import get_function_final_state +from human_eval.execution import check_correctness + + """ From the models' perspective, the model would only want two things: 1) if the execution result is right; @@ -363,4 +366,42 @@ def real_exec_program(cls, program: str, example: Dict[str, Any]) -> Tuple[int, executed_answer = "ERROR: program failed to execute" exec_match = -1 - return exec_match, executed_answer \ No newline at end of file + return exec_match, executed_answer + + +class HumanEvalExecutor(BaseExecutor): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + @overrides + def cache_key_func(self, program: str, example: Dict[str, Any]) -> str: + return example["prompt"] + " | " + 
program + + @overrides + def program_len(self, program: str) -> int: + return python_program_len(program) + + @overrides + def gold_program_len(self, example: Dict[str, Any]) -> int: + return self.program_len(example["canonical_solution"]) + + # TODO: modify this later based on generated programs + @overrides + def process_output(self, output: str, tokenizer_eos_token: str) -> str: + return output.split("### Task End ###")[0] + + @overrides + def exec_result_eq(self, program_dict_1: Dict[str, Any], program_dict_2: Dict[str, Any]) -> bool: + return (program_dict_1['exec_result'] and (program_dict_1['exec_result'] == program_dict_2['exec_result'])) + + @classmethod + def real_exec_program(cls, program: str, example: Dict[str, Any]) -> Tuple[int, Union[str, List, Dict]]: + eval_dict = example + metadata = eval_dict.pop('metadata') + eval_dict.update(metadata) + + result_dict = check_correctness(eval_dict, program, timeout=5) + exec_match = result_dict['passed'] + exec_result = result_dict['result'] + + return exec_match, exec_result diff --git a/finetuning/lightning_modules/datasets/base_datamodule.py b/finetuning/lightning_modules/datasets/base_datamodule.py index d2cd9de9..2343cef3 100644 --- a/finetuning/lightning_modules/datasets/base_datamodule.py +++ b/finetuning/lightning_modules/datasets/base_datamodule.py @@ -10,6 +10,7 @@ from finetuning.lightning_modules.datasets.spider_reader import FewShotSpiderDataset, SpiderDataset from finetuning.lightning_modules.datasets.mathqa_reader import FewShotMathQADataset, MathQADataset from finetuning.lightning_modules.datasets.mbpp_reader import FewShotMBPPDataset +from finetuning.lightning_modules.datasets.humaneval_reader import FewShotHumanEvalDataset from finetuning.lightning_modules.models.seq2seq_model_util import is_model_gpt_style from finetuning.lightning_modules.models.seq2seq_model_util import left_pad_sequences, right_pad_sequences diff --git a/finetuning/lightning_modules/datasets/humaneval_reader.py b/finetuning/lightning_modules/datasets/humaneval_reader.py new file mode 100644 index 00000000..9328f782 --- /dev/null +++ b/finetuning/lightning_modules/datasets/humaneval_reader.py @@ -0,0 +1,35 @@ +import re +import os +import pandas as pd + +from overrides import overrides + +from typing import Dict, Iterable, List, Any, Optional, Union, Tuple + +from finetuning.lightning_modules.datasets.base_reader import NL2CodeDataset, FewShotNL2CodeDataset +from execution.program_tracing import assertion_to_test + +from human_eval.data import write_jsonl, read_problems + + +class FewShotHumanEvalDataset(FewShotNL2CodeDataset): + + instruction: str = "## Given the function header description, complete the python function." 
+ example_io_sep: str = "\n" + + @overrides + def get_test_instance(self, example: Dict[str, Any]) -> List[Dict[str, Any]]: + context = self.get_prompt_for_example(example) + + return [self.get_example_dict(example, context, train_mode=False)] + + # @overrides + def promptify_example(self, example: Dict[str, Any], add_code: bool = True, + add_assertion_n: int = 0, test_input_only: bool = False) -> Tuple[str, str]: + + header = example["prompt"] + + if add_code: + return f'### Task Start ###\n{header}', f'{example["canonical_solution"]}\n### Task End ###' + else: + return f'### Task Start ###\n{header}', '' \ No newline at end of file diff --git a/finetuning/training_configs/few_shot/humaneval.yaml b/finetuning/training_configs/few_shot/humaneval.yaml new file mode 100644 index 00000000..63e1308e --- /dev/null +++ b/finetuning/training_configs/few_shot/humaneval.yaml @@ -0,0 +1,62 @@ +seed_everything: 333 +trainer: + default_root_dir: &exp_name results/debug-tmp + # progress_bar_refresh_rate: 1 + num_sanity_val_steps: 0 + log_every_n_steps: 1 + logger+: + - class_path: finetuning.lightning_modules.patches.patched_loggers.PatchedWandbLogger + init_args: + entity: yale-lily + project: unified-codegen + save_dir: *exp_name + name: *exp_name + log_model: False + save_code: True + offline: False + # offline: True + callbacks+: + - class_path: pytorch_lightning.callbacks.progress.TQDMProgressBar + init_args: + refresh_rate: 1 + + accelerator: gpu + devices: 2 + # strategy: deepspeed_stage_2 + strategy: ddp_find_unused_parameters_false + precision: 16 + +model: + class_path: lightning_modules.models.seq2seq_model.Seq2SeqModel + init_args: + transformer_model_name: default-will-cause-error + executor_cls: execution.executors.HumanEvalExecutor + max_gen_len: 256 + sampling_temp: 0.001 + # sampling_temp_at_k: 0.8 + # pass_at_k: 50 + # max_generation_batches: 5 + gradient_ckpt: false + save_raw_generation_results: true + # print_eval_every_n_batches: 1 + +data: + class_path: lightning_modules.datasets.base_datamodule.FewShotNL2CodeDataModule + init_args: + transformer_model_name: default-will-cause-error + dataset_cls: FewShotHumanEvalDataset + batch_size: 1 + val_batch_size: 4 + ## prompting settings + prompting_init_args: + exemplar_file_path: prompt_files/humaneval-8_exemplars.jsonl + num_exemplars: 3 + fixed_exemplars: true + exemplar_selection_method: first + add_instruction: true + use_chat_format: false + additional_prompt_func_args: + add_assertion_n: 1 + test_input_only: false + val_set_init_args: + file_path: data/humaneval/humaneval.jsonl \ No newline at end of file diff --git a/preprocessing/preprocess_humaneval.py b/preprocessing/preprocess_humaneval.py new file mode 100644 index 00000000..6e39e1d1 --- /dev/null +++ b/preprocessing/preprocess_humaneval.py @@ -0,0 +1,34 @@ +import json +import os +import random +from human_eval.data import write_jsonl, read_problems + +file_path = os.getcwd() + "/data/humaneval/humaneval.jsonl" +os.makedirs(os.path.dirname(file_path), exist_ok=True) + +problems = read_problems() +result_jsonl = [] +metadata_keys = ['task_id', 'entry_point', 'test'] +for task_id in problems: + new_dict = {} + metadata = {} + for key in metadata_keys: + metadata[key] = problems[task_id][key] + new_dict['metadata'] = metadata + new_dict['prompt'] = problems[task_id]['prompt'] + new_dict['canonical_solution'] = problems[task_id]['canonical_solution'] + result_jsonl.append(new_dict) + +write_jsonl(file_path, result_jsonl) + +selected_problems = [] +with open(file_path, 'r') as 
file:
+    for line in file:
+        json_obj = json.loads(line)
+        selected_problems.append(json_obj)
+selected_problems = random.sample(selected_problems, 8)
+
+file_path = os.getcwd() + "/prompt_files/humaneval-8_exemplars.jsonl"
+os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
+write_jsonl(file_path, selected_problems)
diff --git a/prompt_files/humaneval-8_exemplars.jsonl b/prompt_files/humaneval-8_exemplars.jsonl
new file mode 100644
index 00000000..726cface
--- /dev/null
+++ b/prompt_files/humaneval-8_exemplars.jsonl
@@ -0,0 +1,8 @@
+{"metadata": {"entry_point": "fib4", "test": "\n\nMETADATA = {}\n\n\ndef check(candidate):\n assert candidate(5) == 4\n assert candidate(8) == 28\n assert candidate(10) == 104\n assert candidate(12) == 386\n\n"}, "task_id": "HumanEval/46", "prompt": "\n\ndef fib4(n: int):\n \"\"\"The Fib4 number sequence is a sequence similar to the Fibbonacci sequnece that's defined as follows:\n fib4(0) -> 0\n fib4(1) -> 0\n fib4(2) -> 2\n fib4(3) -> 0\n fib4(n) -> fib4(n-1) + fib4(n-2) + fib4(n-3) + fib4(n-4).\n Please write a function to efficiently compute the n-th element of the fib4 number sequence. Do not use recursion.\n >>> fib4(5)\n 4\n >>> fib4(6)\n 8\n >>> fib4(7)\n 14\n \"\"\"\n", "canonical_solution": " results = [0, 0, 2, 0]\n if n < 4:\n return results[n]\n\n for _ in range(4, n + 1):\n results.append(results[-1] + results[-2] + results[-3] + results[-4])\n results.pop(0)\n\n return results[-1]\n"}
+{"metadata": {"entry_point": "max_fill", "test": "def check(candidate):\n\n\n # Check some simple cases\n assert True, \"This prints if this assert fails 1 (good for debugging!)\"\n assert candidate([[0,0,1,0], [0,1,0,0], [1,1,1,1]], 1) == 6, \"Error\"\n assert candidate([[0,0,1,1], [0,0,0,0], [1,1,1,1], [0,1,1,1]], 2) == 5, \"Error\"\n assert candidate([[0,0,0], [0,0,0]], 5) == 0, \"Error\"\n\n # Check some edge cases that are easy to work out by hand.\n assert True, \"This prints if this assert fails 2 (also good for debugging!)\"\n assert candidate([[1,1,1,1], [1,1,1,1]], 2) == 4, \"Error\"\n assert candidate([[1,1,1,1], [1,1,1,1]], 9) == 2, \"Error\"\n\n"}, "task_id": "HumanEval/115", "prompt": "\ndef max_fill(grid, capacity):\n import math\n \"\"\"\n You are given a rectangular grid of wells. 
Each row represents a single well,\n and each 1 in a row represents a single unit of water.\n Each well has a corresponding bucket that can be used to extract water from it, \n and all buckets have the same capacity.\n Your task is to use the buckets to empty the wells.\n Output the number of times you need to lower the buckets.\n\n Example 1:\n Input: \n grid : [[0,0,1,0], [0,1,0,0], [1,1,1,1]]\n bucket_capacity : 1\n Output: 6\n\n Example 2:\n Input: \n grid : [[0,0,1,1], [0,0,0,0], [1,1,1,1], [0,1,1,1]]\n bucket_capacity : 2\n Output: 5\n \n Example 3:\n Input: \n grid : [[0,0,0], [0,0,0]]\n bucket_capacity : 5\n Output: 0\n\n Constraints:\n * all wells have the same length\n * 1 <= grid.length <= 10^2\n * 1 <= grid[:,1].length <= 10^2\n * grid[i][j] -> 0 | 1\n * 1 <= capacity <= 10\n \"\"\"\n", "canonical_solution": " return sum([math.ceil(sum(arr)/capacity) for arr in grid])\n"} +{"metadata": {"entry_point": "separate_paren_groups", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate('(()()) ((())) () ((())()())') == [\n '(()())', '((()))', '()', '((())()())'\n ]\n assert candidate('() (()) ((())) (((())))') == [\n '()', '(())', '((()))', '(((())))'\n ]\n assert candidate('(()(())((())))') == [\n '(()(())((())))'\n ]\n assert candidate('( ) (( )) (( )( ))') == ['()', '(())', '(()())']\n"}, "task_id": "HumanEval/1", "prompt": "from typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n", "canonical_solution": " result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == '(':\n current_depth += 1\n current_string.append(c)\n elif c == ')':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(''.join(current_string))\n current_string.clear()\n\n return result\n"} +{"metadata": {"entry_point": "filter_integers", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([]) == []\n assert candidate([4, {}, [], 23.2, 9, 'adasd']) == [4, 9]\n assert candidate([3, 'c', 3, 3, 'a', 'b']) == [3, 3, 3]\n"}, "task_id": "HumanEval/22", "prompt": "from typing import List, Any\n\n\ndef filter_integers(values: List[Any]) -> List[int]:\n \"\"\" Filter given list of any python values only for integers\n >>> filter_integers(['a', 3.14, 5])\n [5]\n >>> filter_integers([1, 2, 3, 'abc', {}, []])\n [1, 2, 3]\n \"\"\"\n", "canonical_solution": " return [x for x in values if isinstance(x, int)]\n"} +{"metadata": {"entry_point": "encode", "test": "def check(candidate):\n\n # Check some simple cases\n assert candidate('TEST') == 'tgst', \"This prints if this assert fails 1 (good for debugging!)\"\n assert candidate('Mudasir') == 'mWDCSKR', \"This prints if this assert fails 2 (good for debugging!)\"\n assert candidate('YES') == 'ygs', \"This prints if this assert fails 3 (good for debugging!)\"\n \n # Check some edge cases that are easy to work out by hand.\n assert candidate('This is a message') == 'tHKS KS C MGSSCGG', \"This prints if this assert fails 2 (also good for debugging!)\"\n assert 
candidate(\"I DoNt KnOw WhAt tO WrItE\") == 'k dQnT kNqW wHcT Tq wRkTg', \"This prints if this assert fails 2 (also good for debugging!)\"\n\n"}, "task_id": "HumanEval/93", "prompt": "\ndef encode(message):\n \"\"\"\n Write a function that takes a message, and encodes in such a \n way that it swaps case of all letters, replaces all vowels in \n the message with the letter that appears 2 places ahead of that \n vowel in the english alphabet. \n Assume only letters. \n \n Examples:\n >>> encode('test')\n 'TGST'\n >>> encode('This is a message')\n 'tHKS KS C MGSSCGG'\n \"\"\"\n", "canonical_solution": " vowels = \"aeiouAEIOU\"\n vowels_replace = dict([(i, chr(ord(i) + 2)) for i in vowels])\n message = message.swapcase()\n return ''.join([vowels_replace[i] if i in vowels else i for i in message])\n"} +{"metadata": {"entry_point": "valid_date", "test": "def check(candidate):\n\n # Check some simple cases\n assert candidate('03-11-2000') == True\n\n assert candidate('15-01-2012') == False\n\n assert candidate('04-0-2040') == False\n\n assert candidate('06-04-2020') == True\n\n assert candidate('01-01-2007') == True\n\n assert candidate('03-32-2011') == False\n\n assert candidate('') == False\n\n assert candidate('04-31-3000') == False\n\n assert candidate('06-06-2005') == True\n\n assert candidate('21-31-2000') == False\n\n assert candidate('04-12-2003') == True\n\n assert candidate('04122003') == False\n\n assert candidate('20030412') == False\n\n assert candidate('2003-04') == False\n\n assert candidate('2003-04-12') == False\n\n assert candidate('04-2003') == False\n"}, "task_id": "HumanEval/124", "prompt": "\ndef valid_date(date):\n \"\"\"You have to write a function which validates a given date string and\n returns True if the date is valid otherwise False.\n The date is valid if all of the following rules are satisfied:\n 1. The date string is not empty.\n 2. The number of days is not less than 1 or higher than 31 days for months 1,3,5,7,8,10,12. And the number of days is not less than 1 or higher than 30 days for months 4,6,9,11. And, the number of days is not less than 1 or higher than 29 for the month 2.\n 3. The months should not be less than 1 or higher than 12.\n 4. 
The date should be in the format: mm-dd-yyyy\n\n for example: \n valid_date('03-11-2000') => True\n\n valid_date('15-01-2012') => False\n\n valid_date('04-0-2040') => False\n\n valid_date('06-04-2020') => True\n\n valid_date('06/04/2020') => False\n \"\"\"\n", "canonical_solution": " try:\n date = date.strip()\n month, day, year = date.split('-')\n month, day, year = int(month), int(day), int(year)\n if month < 1 or month > 12:\n return False\n if month in [1,3,5,7,8,10,12] and day < 1 or day > 31:\n return False\n if month in [4,6,9,11] and day < 1 or day > 30:\n return False\n if month == 2 and day < 1 or day > 29:\n return False\n except:\n return False\n\n return True\n"} +{"metadata": {"entry_point": "rounded_avg", "test": "def check(candidate):\n\n # Check some simple cases\n assert candidate(1, 5) == \"0b11\"\n assert candidate(7, 13) == \"0b1010\"\n assert candidate(964,977) == \"0b1111001010\"\n assert candidate(996,997) == \"0b1111100100\"\n assert candidate(560,851) == \"0b1011000010\"\n assert candidate(185,546) == \"0b101101110\"\n assert candidate(362,496) == \"0b110101101\"\n assert candidate(350,902) == \"0b1001110010\"\n assert candidate(197,233) == \"0b11010111\"\n\n\n # Check some edge cases that are easy to work out by hand.\n assert candidate(7, 5) == -1\n assert candidate(5, 1) == -1\n assert candidate(5, 5) == \"0b101\"\n\n"}, "task_id": "HumanEval/103", "prompt": "\ndef rounded_avg(n, m):\n \"\"\"You are given two positive integers n and m, and your task is to compute the\n average of the integers from n through m (including n and m). \n Round the answer to the nearest integer and convert that to binary.\n If n is greater than m, return -1.\n Example:\n rounded_avg(1, 5) => \"0b11\"\n rounded_avg(7, 5) => -1\n rounded_avg(10, 20) => \"0b1111\"\n rounded_avg(20, 33) => \"0b11010\"\n \"\"\"\n", "canonical_solution": " if m < n:\n return -1\n summation = 0\n for i in range(n, m+1):\n summation += i\n return bin(round(summation/(m - n + 1)))\n"} +{"metadata": {"entry_point": "Strongest_Extension", "test": "def check(candidate):\n\n # Check some simple cases\n assert candidate('Watashi', ['tEN', 'niNE', 'eIGHt8OKe']) == 'Watashi.eIGHt8OKe'\n assert candidate('Boku123', ['nani', 'NazeDa', 'YEs.WeCaNe', '32145tggg']) == 'Boku123.YEs.WeCaNe'\n assert candidate('__YESIMHERE', ['t', 'eMptY', 'nothing', 'zeR00', 'NuLl__', '123NoooneB321']) == '__YESIMHERE.NuLl__'\n assert candidate('K', ['Ta', 'TAR', 't234An', 'cosSo']) == 'K.TAR'\n assert candidate('__HAHA', ['Tab', '123', '781345', '-_-']) == '__HAHA.123'\n assert candidate('YameRore', ['HhAas', 'okIWILL123', 'WorkOut', 'Fails', '-_-']) == 'YameRore.okIWILL123'\n assert candidate('finNNalLLly', ['Die', 'NowW', 'Wow', 'WoW']) == 'finNNalLLly.WoW'\n\n # Check some edge cases that are easy to work out by hand.\n assert candidate('_', ['Bb', '91245']) == '_.Bb'\n assert candidate('Sp', ['671235', 'Bb']) == 'Sp.671235'\n \n"}, "task_id": "HumanEval/153", "prompt": "\ndef Strongest_Extension(class_name, extensions):\n \"\"\"You will be given the name of a class (a string) and a list of extensions.\n The extensions are to be used to load additional classes to the class. The\n strength of the extension is as follows: Let CAP be the number of the uppercase\n letters in the extension's name, and let SM be the number of lowercase letters \n in the extension's name, the strength is given by the fraction CAP - SM. 
\n You should find the strongest extension and return a string in this \n format: ClassName.StrongestExtensionName.\n If there are two or more extensions with the same strength, you should\n choose the one that comes first in the list.\n For example, if you are given \"Slices\" as the class and a list of the\n extensions: ['SErviNGSliCes', 'Cheese', 'StuFfed'] then you should\n return 'Slices.SErviNGSliCes' since 'SErviNGSliCes' is the strongest extension \n (its strength is -1).\n Example:\n for Strongest_Extension('my_class', ['AA', 'Be', 'CC']) == 'my_class.AA'\n \"\"\"\n", "canonical_solution": " strong = extensions[0]\n my_val = len([x for x in extensions[0] if x.isalpha() and x.isupper()]) - len([x for x in extensions[0] if x.isalpha() and x.islower()])\n for s in extensions:\n val = len([x for x in s if x.isalpha() and x.isupper()]) - len([x for x in s if x.isalpha() and x.islower()])\n if val > my_val:\n strong = s\n my_val = val\n\n ans = class_name + \".\" + strong\n return ans\n\n"} From ec58177cc7c486524380c854bbb19b8980439da7 Mon Sep 17 00:00:00 2001 From: rays1024 Date: Tue, 11 Jul 2023 11:53:05 -0400 Subject: [PATCH 09/18] updated readme --- README.md | 65 +++++-------------------------------------------------- 1 file changed, 5 insertions(+), 60 deletions(-) diff --git a/README.md b/README.md index fc35d15c..4d06372f 100644 --- a/README.md +++ b/README.md @@ -1,66 +1,11 @@ -# NLP4Code -Repository for the NLP4Code project at the LILY lab. +Installing Human Eval -## Installation -*[Recommended]* Create a virtualenv or conda enviroment -```bash -conda create -n nlp4code python=3.8 -conda activate nlp4code ``` -Then, install the dependencies: -```bash -pip install -r requirements.txt +git clone https://github.com/openai/human-eval +pip install -e human-eval ``` -*(Optional)* At any point, if you met with the Python import problem (e.g., `ModuleNotFoundError`), try doing this in the main (`NLP4Code`) directory: -```bash -export PYTHONPATH=`pwd` -``` - -To run LLAMA-based model, you need to install the development version of `transformers` library: -```bash -pip install git+https://github.com/huggingface/transformers -``` - -## Wandb -We use Wandb for experiment tracking. Please register ask Ansong for an invitation to the Wandb Yale-LILY team before -running experiments. When you are ready to run the exps and log it to the cloud, do the following: -``` -wandb login -``` -Paste your API key and the login is complete. When start running experiments, you should see something like -``` -wandb: Tracking run with wandb version 0.12.11 -wandb: Run data is saved locally in /home/ansongni/Code/NLP4Code/wandb/run-20220309_150158-1ebacxm4 -wandb: Run `wandb offline` to turn off syncing. -wandb: Syncing run mathqa-gpt-finetuning -wandb: ⭐️ View project at https://wandb.ai/yale-lily/unified-codegen -wandb: 🚀 View run at https://wandb.ai/yale-lily/unified-codegen/runs/1ebacxm4 -``` - -If you want to do some test runs without logging to the cloud, run `wandb offline` first as suggested above. +Creating JSONL files -## Naming of the experiments -In the $*.yaml$ configuration file, you should see a line like ``` -default_root_dir: &exp_name results/mathqa-gpt_neo_1.3B-finetuning -``` -We automatically get the experiment name by the string after `/`, the tags for the experiments are automatically -generated by spliting that string by `-`. In this case, the experiment will be named `mathqa-gpt_neo_1.3B-finetuning` -and the tags will be `["mathqa", "gpt_neo_1.3B", "finetuning"]`. 
Please follow this convention so that we can write all
-of this in one place.
-
-## Fine-tuning
-(Read the previous sections first if you are ready to run experiments)
-For fine-tuning, in the main directory, do:
-```
-python finetuning/trainer.py fit --config finetuning/training_configs/*.yaml
-```
-
-## Testing
-There are some basic tests in the `tests` folder, to run all the tests (follow [this link](https://docs.python.org/3/library/unittest.html#command-line-interface) for more):
-To run tests, do
-```bash
-python -m unittest discover
-# or
-python -m unittest discover -s -p '*_test.py'
+python preprocessing/preprocess_humaneval.py
 ```
\ No newline at end of file

From fbdab1fccc16c703fd6c7fdca771e30792188335 Mon Sep 17 00:00:00 2001
From: rays1024
Date: Fri, 14 Jul 2023 09:58:17 -0400
Subject: [PATCH 10/18] updated humaneval.yaml

---
 finetuning/training_configs/few_shot/humaneval.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finetuning/training_configs/few_shot/humaneval.yaml b/finetuning/training_configs/few_shot/humaneval.yaml
index 63e1308e..88d3ac8b 100644
--- a/finetuning/training_configs/few_shot/humaneval.yaml
+++ b/finetuning/training_configs/few_shot/humaneval.yaml
@@ -50,7 +50,7 @@ data:
     ## prompting settings
     prompting_init_args:
       exemplar_file_path: prompt_files/humaneval-8_exemplars.jsonl
-      num_exemplars: 3
+      num_exemplars: 8
       fixed_exemplars: true
       exemplar_selection_method: first
       add_instruction: true

From b55d5c79f5a935cf4d051ee0f64c8e6a5269ca0b Mon Sep 17 00:00:00 2001
From: rays1024
Date: Thu, 20 Jul 2023 15:35:20 -0400
Subject: [PATCH 11/18] added mkdir code

---
 finetuning/lightning_modules/models/seq2seq_model.py |  1 +
 finetuning/training_configs/few_shot/humaneval.yaml  | 28 +++++++++----------
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/finetuning/lightning_modules/models/seq2seq_model.py b/finetuning/lightning_modules/models/seq2seq_model.py
index 885724f4..5b7acaf4 100644
--- a/finetuning/lightning_modules/models/seq2seq_model.py
+++ b/finetuning/lightning_modules/models/seq2seq_model.py
@@ -391,6 +391,7 @@ def validation_epoch_end(self, outputs: List[Dict[str, Any]]) -> None:
         # save the predictions
         save_pred_file_path = os.path.join(self.trainer.log_dir,
                                 f'predictions_step_{self.trainer.global_step}_rank_{self.trainer.global_rank}.jsonl')
+        os.makedirs(os.path.dirname(save_pred_file_path), exist_ok=True)
         with open(save_pred_file_path, 'w+') as f:
             for prediction in self.predictions:
                 f.write(json.dumps(prediction)+'\n')
diff --git a/finetuning/training_configs/few_shot/humaneval.yaml b/finetuning/training_configs/few_shot/humaneval.yaml
index 88d3ac8b..3fee1319 100644
--- a/finetuning/training_configs/few_shot/humaneval.yaml
+++ b/finetuning/training_configs/few_shot/humaneval.yaml
@@ -4,30 +4,30 @@ trainer:
   # progress_bar_refresh_rate: 1
   num_sanity_val_steps: 0
   log_every_n_steps: 1
-  logger+:
-    - class_path: finetuning.lightning_modules.patches.patched_loggers.PatchedWandbLogger
-      init_args:
-        entity: yale-lily
-        project: unified-codegen
-        save_dir: *exp_name
-        name: *exp_name
-        log_model: False
-        save_code: True
-        offline: False
-        # offline: True
+  # logger+:
+  #   - class_path: finetuning.lightning_modules.patches.patched_loggers.PatchedWandbLogger
+  #     init_args:
+  #       entity: yale-lily
+  #       project: unified-codegen
+  #       save_dir: *exp_name
+  #       name: *exp_name
+  #       log_model: False
+  #       save_code: True
+  #       offline: False
+  #       # offline: True
   callbacks+:
     - class_path: pytorch_lightning.callbacks.progress.TQDMProgressBar
       init_args:
         refresh_rate: 1
accelerator: gpu - devices: 2 + devices: 1 # strategy: deepspeed_stage_2 strategy: ddp_find_unused_parameters_false precision: 16 model: - class_path: lightning_modules.models.seq2seq_model.Seq2SeqModel + class_path: finetuning.lightning_modules.models.seq2seq_model.Seq2SeqModel init_args: transformer_model_name: default-will-cause-error executor_cls: execution.executors.HumanEvalExecutor @@ -41,7 +41,7 @@ model: # print_eval_every_n_batches: 1 data: - class_path: lightning_modules.datasets.base_datamodule.FewShotNL2CodeDataModule + class_path: finetuning.lightning_modules.datasets.base_datamodule.FewShotNL2CodeDataModule init_args: transformer_model_name: default-will-cause-error dataset_cls: FewShotHumanEvalDataset From abd3318d7d30d2d325b0bfdf6e554fd1a404e813 Mon Sep 17 00:00:00 2001 From: rays1024 Date: Mon, 24 Jul 2023 21:23:21 -0400 Subject: [PATCH 12/18] fixed execution rate always being 1 --- execution/executors.py | 3 +++ .../training_configs/few_shot/humaneval.yaml | 22 +++++++++---------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/execution/executors.py b/execution/executors.py index 23ecea52..62246850 100644 --- a/execution/executors.py +++ b/execution/executors.py @@ -403,5 +403,8 @@ def real_exec_program(cls, program: str, example: Dict[str, Any]) -> Tuple[int, result_dict = check_correctness(eval_dict, program, timeout=5) exec_match = result_dict['passed'] exec_result = result_dict['result'] + + if exec_match < 1 and exec_result.strip() == "failed:": + exec_match = -1 return exec_match, exec_result diff --git a/finetuning/training_configs/few_shot/humaneval.yaml b/finetuning/training_configs/few_shot/humaneval.yaml index 3fee1319..7f539ceb 100644 --- a/finetuning/training_configs/few_shot/humaneval.yaml +++ b/finetuning/training_configs/few_shot/humaneval.yaml @@ -4,17 +4,17 @@ trainer: # progress_bar_refresh_rate: 1 num_sanity_val_steps: 0 log_every_n_steps: 1 - # logger+: - # - class_path: finetuning.lightning_modules.patches.patched_loggers.PatchedWandbLogger - # init_args: - # entity: yale-lily - # project: unified-codegen - # save_dir: *exp_name - # name: *exp_name - # log_model: False - # save_code: True - # offline: False - # # offline: True + logger+: + - class_path: finetuning.lightning_modules.patches.patched_loggers.PatchedWandbLogger + init_args: + entity: yale-lily + project: unified-codegen + save_dir: *exp_name + name: *exp_name + log_model: False + save_code: True + offline: False + # offline: True callbacks+: - class_path: pytorch_lightning.callbacks.progress.TQDMProgressBar init_args: From 3cc48d2d750276f363efd783786defa6640629b0 Mon Sep 17 00:00:00 2001 From: rays1024 Date: Mon, 24 Jul 2023 21:26:10 -0400 Subject: [PATCH 13/18] fixed exec_rate error --- execution/executors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/execution/executors.py b/execution/executors.py index 62246850..1d35d86c 100644 --- a/execution/executors.py +++ b/execution/executors.py @@ -404,7 +404,7 @@ def real_exec_program(cls, program: str, example: Dict[str, Any]) -> Tuple[int, exec_match = result_dict['passed'] exec_result = result_dict['result'] - if exec_match < 1 and exec_result.strip() == "failed:": + if exec_match < 1 and exec_result.strip() != "failed:": exec_match = -1 return exec_match, exec_result From efd75f9ef394b0ce10d4d0d2fe571e451ffa7ec4 Mon Sep 17 00:00:00 2001 From: rays1024 Date: Wed, 26 Jul 2023 22:28:24 -0400 Subject: [PATCH 14/18] execution now uses codex prompt and cutoff method --- execution/executors.py | 12 
+++++++++++- .../lightning_modules/datasets/humaneval_reader.py | 6 +++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/execution/executors.py b/execution/executors.py index 1d35d86c..bf2b21e7 100644 --- a/execution/executors.py +++ b/execution/executors.py @@ -388,7 +388,17 @@ def gold_program_len(self, example: Dict[str, Any]) -> int: # TODO: modify this later based on generated programs @overrides def process_output(self, output: str, tokenizer_eos_token: str) -> str: - return output.split("### Task End ###")[0] + stop_sequence = [ '\nclass', '\ndef', '\n#', '\nif', '\nprint'] + min_index = len(output) # Initialize with a large value + for substring in stop_sequence: + index = output.find(substring) + if index != -1 and index < min_index: + min_index = index + + if min_index < len(output): + return output[:min_index] + else: + return output @overrides def exec_result_eq(self, program_dict_1: Dict[str, Any], program_dict_2: Dict[str, Any]) -> bool: diff --git a/finetuning/lightning_modules/datasets/humaneval_reader.py b/finetuning/lightning_modules/datasets/humaneval_reader.py index 9328f782..4cceaf94 100644 --- a/finetuning/lightning_modules/datasets/humaneval_reader.py +++ b/finetuning/lightning_modules/datasets/humaneval_reader.py @@ -14,7 +14,7 @@ class FewShotHumanEvalDataset(FewShotNL2CodeDataset): - instruction: str = "## Given the function header description, complete the python function." + instruction: str = "" example_io_sep: str = "\n" @overrides @@ -30,6 +30,6 @@ def promptify_example(self, example: Dict[str, Any], add_code: bool = True, header = example["prompt"] if add_code: - return f'### Task Start ###\n{header}', f'{example["canonical_solution"]}\n### Task End ###' + return header, f'{example["canonical_solution"]}\n\n' else: - return f'### Task Start ###\n{header}', '' \ No newline at end of file + return header, '' \ No newline at end of file From e6d556850dfaa1f88e0d194837b1902e0afc0256 Mon Sep 17 00:00:00 2001 From: rays1024 Date: Mon, 31 Jul 2023 22:49:37 -0400 Subject: [PATCH 15/18] no cutoff at execution to check for raw output --- execution/executors.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/execution/executors.py b/execution/executors.py index bf2b21e7..16321f89 100644 --- a/execution/executors.py +++ b/execution/executors.py @@ -388,17 +388,18 @@ def gold_program_len(self, example: Dict[str, Any]) -> int: # TODO: modify this later based on generated programs @overrides def process_output(self, output: str, tokenizer_eos_token: str) -> str: - stop_sequence = [ '\nclass', '\ndef', '\n#', '\nif', '\nprint'] - min_index = len(output) # Initialize with a large value - for substring in stop_sequence: - index = output.find(substring) - if index != -1 and index < min_index: - min_index = index + return output # no cutoff to check raw output + # stop_sequence = [ '\nclass', '\ndef', '\n#', '\nif', '\nprint'] + # min_index = len(output) # Initialize with a large value + # for substring in stop_sequence: + # index = output.find(substring) + # if index != -1 and index < min_index: + # min_index = index - if min_index < len(output): - return output[:min_index] - else: - return output + # if min_index < len(output): + # return output[:min_index] + # else: + # return output @overrides def exec_result_eq(self, program_dict_1: Dict[str, Any], program_dict_2: Dict[str, Any]) -> bool: From 07695303e9e33b04d3320be23eb3eeef6a969cd1 Mon Sep 17 00:00:00 2001 From: rays1024 Date: Wed, 2 Aug 2023 14:16:26 -0400 
Subject: [PATCH 16/18] updated gitignore

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index 2b10e219..f573c1cf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -139,3 +139,6 @@ results/
 # defined by rui
 DS_1000/
 .DS_Store
+NLP4Code_humaneval_outputs
+NLP4Code_ds1000_outputs
+raw_output_evaluation.py

From 7042d2ea5f141cbab41d71c20b7bc9c4ce7721c6 Mon Sep 17 00:00:00 2001
From: rays1024
Date: Wed, 2 Aug 2023 19:45:11 -0400
Subject: [PATCH 17/18] execution now preprocesses model output

---
 execution/executors.py | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/execution/executors.py b/execution/executors.py
index 16321f89..bf2b21e7 100644
--- a/execution/executors.py
+++ b/execution/executors.py
@@ -388,18 +388,17 @@ def gold_program_len(self, example: Dict[str, Any]) -> int:
     # TODO: modify this later based on generated programs
     @overrides
     def process_output(self, output: str, tokenizer_eos_token: str) -> str:
-        return output # no cutoff to check raw output
-        # stop_sequence = [ '\nclass', '\ndef', '\n#', '\nif', '\nprint']
-        # min_index = len(output) # Initialize with a large value
-        # for substring in stop_sequence:
-        #     index = output.find(substring)
-        #     if index != -1 and index < min_index:
-        #         min_index = index
+        stop_sequence = [ '\nclass', '\ndef', '\n#', '\nif', '\nprint']
+        min_index = len(output) # Initialize with a large value
+        for substring in stop_sequence:
+            index = output.find(substring)
+            if index != -1 and index < min_index:
+                min_index = index
 
-        # if min_index < len(output):
-        #     return output[:min_index]
-        # else:
-        #     return output
+        if min_index < len(output):
+            return output[:min_index]
+        else:
+            return output

From 91a1d67db4199eb54d84dc253542d095b2718860 Mon Sep 17 00:00:00 2001
From: rays1024
Date: Sat, 5 Aug 2023 11:47:21 -0400
Subject: [PATCH 18/18] add one space in front of model output if needed

---
 execution/executors.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/execution/executors.py b/execution/executors.py
index bf2b21e7..a02aa6d7 100644
--- a/execution/executors.py
+++ b/execution/executors.py
@@ -394,11 +394,17 @@ def process_output(self, output: str, tokenizer_eos_token: str) -> str:
             index = output.find(substring)
             if index != -1 and index < min_index:
                 min_index = index
-
+        
         if min_index < len(output):
-            return output[:min_index]
+            processed_output = output[:min_index]
         else:
-            return output
+            processed_output = output
+        
+        # for llama, gpt4_alpaca_lora, alpaca_lora_7b, the model output may be missing a space
+        if not processed_output.startswith(" "):
+            processed_output = " " + processed_output
+        
+        return processed_output
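
Taken together, patches 14-18 settle on a single post-processing routine for HumanEval completions: truncate at the first top-level continuation, then restore the leading indentation space that llama/alpaca-style models sometimes drop. A minimal standalone sketch of that routine follows; the constant name and sample strings are local to the sketch, not taken from the repository:

```python
# Sketch of the final HumanEvalExecutor.process_output behavior.
STOP_SEQUENCES = ['\nclass', '\ndef', '\n#', '\nif', '\nprint']

def process_output(output: str) -> str:
    # Truncate at the earliest stop sequence, if any occurs.
    cut = min((i for i in (output.find(s) for s in STOP_SEQUENCES) if i != -1),
              default=len(output))
    processed = output[:cut]
    # Re-add the leading indentation space some models omit from the first line.
    if not processed.startswith(" "):
        processed = " " + processed
    return processed

assert process_output("   return sum(values)\ndef unrelated():\n    pass") == "   return sum(values)"
assert process_output("return 42") == " return 42"
```

The truncated completion is then scored with `human_eval`'s own checker, as in `HumanEvalExecutor.real_exec_program` above. Assuming `human-eval` is installed per the updated README (and its sandboxed `exec` call enabled, since the library ships with it commented out in `execution.py`), an end-to-end check looks roughly like the following; the completion string is a hand-written illustration:

```python
from human_eval.data import read_problems
from human_eval.execution import check_correctness

problem = read_problems()["HumanEval/0"]  # has_close_elements
completion = (
    "    for i, a in enumerate(numbers):\n"
    "        for b in numbers[i + 1:]:\n"
    "            if abs(a - b) < threshold:\n"
    "                return True\n"
    "    return False\n"
)
result = check_correctness(problem, completion, timeout=5)
print(result["passed"], result["result"])  # True 'passed' for a correct solution
```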