Fix the IndexError in get_response #618

Open · wants to merge 6 commits into base: main
94 changes: 94 additions & 0 deletions DataCollator.py
@@ -0,0 +1,94 @@
from dataclasses import dataclass
from typing import Any, Optional, Union

import numpy as np

from transformers import PreTrainedTokenizerBase
from transformers.utils import PaddingStrategy

@dataclass
class DataCollatorForSeq2Seq:
"""
Data collator that will dynamically pad the inputs received, as well as the labels.

Args:
tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
The tokenizer used for encoding the data.
model ([`PreTrainedModel`]):
The model that is being trained. If set and has the *prepare_decoder_input_ids_from_labels*, use it to
prepare the *decoder_input_ids*

This is useful when using *label_smoothing* to avoid calculating loss twice.
padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
among:

- `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
sequence is provided).
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
acceptable input length for the model if that argument is not provided.
- `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths).
max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value.

This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
7.5 (Volta).
label_pad_token_id (`int`, *optional*, defaults to -100):
The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
return_tensors (`str`):
The type of Tensor to return. Allowable values are "np", "pt" and "tf".
"""

tokenizer: PreTrainedTokenizerBase
model: Optional[Any] = None
padding: Union[bool, str, PaddingStrategy] = True
max_length: Optional[int] = None
pad_to_multiple_of: Optional[int] = None
label_pad_token_id: int = -100
return_tensors: str = "pt"

def __call__(self, features, return_tensors=None):
if return_tensors is None:
return_tensors = self.return_tensors
labels = [feature["labels"] for feature in features] if "labels" in features[0].keys() else None
# We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the
# same length to return tensors.
if labels is not None:
max_label_length = max(len(l) for l in labels)
if self.pad_to_multiple_of is not None:
max_label_length = (
(max_label_length + self.pad_to_multiple_of - 1)
// self.pad_to_multiple_of
* self.pad_to_multiple_of
)
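                # e.g. max_label_length = 13 with pad_to_multiple_of = 8 rounds up to (13 + 7) // 8 * 8 = 16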

padding_side = self.tokenizer.padding_side
for feature in features:
remainder = [self.label_pad_token_id] * (max_label_length - len(feature["labels"]))
if isinstance(feature["labels"], list):
feature["labels"] = (
feature["labels"] + remainder if padding_side == "right" else remainder + feature["labels"]
)
elif padding_side == "right":
feature["labels"] = np.concatenate([feature["labels"], remainder]).astype(np.int64)
else:
feature["labels"] = np.concatenate([remainder, feature["labels"]]).astype(np.int64)

features = self.tokenizer.pad(
features,
padding=self.padding,
max_length=self.max_length,
pad_to_multiple_of=self.pad_to_multiple_of,
return_tensors=return_tensors,
)

# prepare decoder_input_ids
if (
labels is not None
and self.model is not None
and hasattr(self.model, "prepare_decoder_input_ids_from_labels")
):
decoder_input_ids = self.model.prepare_decoder_input_ids_from_labels(labels=features["labels"])
features["decoder_input_ids"] = decoder_input_ids

return features
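
For context, a minimal usage sketch of the collator above, assuming a seq2seq checkpoint such as `t5-small`; the model name, feature values, and variable names are illustrative, not part of this PR:

from torch.utils.data import DataLoader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, pad_to_multiple_of=8)

# two toy examples whose labels have different lengths
features = [
    {"input_ids": [37, 423, 8, 1], "attention_mask": [1, 1, 1, 1], "labels": [3, 7, 1]},
    {"input_ids": [37, 1], "attention_mask": [1, 1], "labels": [9, 1]},
]
batch = collator(features)  # inputs padded by the tokenizer, labels padded with -100, decoder_input_ids prepared by the model
loader = DataLoader(features, batch_size=2, collate_fn=collator)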
12 changes: 12 additions & 0 deletions README.md
@@ -1,3 +1,15 @@

Forked from Alpaca-LoRA, with some parts of the code modified for my customized usage.

- add batched evaluation.
- add metric calculation.
- fix the error caused by `torch.compile` and the PEFT APIs; see [issue](https://github.com/tloen/alpaca-lora/issues/609)
- ...


# 🦙🌲🤏 Alpaca-LoRA

- 🤗 **Try the pretrained model out [here](https://huggingface.co/spaces/tloen/alpaca-lora), courtesy of a GPU grant from Huggingface!**
133 changes: 133 additions & 0 deletions calculate_metrics.py
@@ -0,0 +1,133 @@
import argparse
import copy
import json
import os

import numpy as np

from calculate_metrics_src import compute_grouped_metrics_v2, compute_metrics


def process_superni(superni_preds, superni_meta):
assert len(superni_preds) == len(superni_meta), "The length of the predictions {} and the metadata {} should be the same".format(len(superni_preds), len(superni_meta))
final_res = []
for pred, meta in zip(superni_preds, superni_meta):
        # to ensure the order is the same
assert pred["input"] == meta["input"], "The input of the prediction {} and the metadata {} should be the same".format(pred["input"], meta["input"])
assert pred["instruction"] == meta["instruction"], "The instruction of the prediction {} and the metadata {} should be the same".format(pred["instruction"], meta["instruction"])
item = copy.deepcopy(meta)
item["response"] = pred["response"]
final_res.append(item)

return final_res
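
# Illustrative shape of one merged record returned above (field values are hypothetical):
# {"instruction": "...", "input": "...", "output": ["gold answer 1", "gold answer 2"],
#  "categories": ["Question Answering"], "response": "model prediction"}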

def calculate_metrics(all_results, save_path=None, save_prefix=None):
instructions, inputs, outputs, responses = [], [], [], []
    categories = []
for result in all_results:
instruction = result["instruction"]
input = result["input"]
output = result["output"]
response = result["response"]
if "categories" in result:
# superni
categoreis.append(result["categories"])
assert type(output) == list, "The output of superni should be a list, but got {}, save_prefix: {}".format(output, save_prefix)
outputs.append(output) # the output of the superni is already a list
else:
# p3, mmlu, bbh
assert type(output) == str and type(output) != list, "The output of p3, mmlu, and bbh should be a string (only superni is a list), but got {}, save_prefix: {}".format(output, save_prefix)
outputs.append([output]) # we expect the item in the list to be a list (cuz the `metric_max_over_ground_truths`)
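            # e.g. a single gold answer "yes" becomes ["yes"], so every reference entry is a list of acceptable answers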

instructions.append(instruction)
inputs.append(input)
responses.append(response)

# calculate the metrics
    if len(categories) == 0:
        categories = None

    metrics = compute_ni_metrics(responses, outputs, instructions, inputs, categories=categories, save_path=save_path, save_prefix=save_prefix)

return metrics


def compute_ni_metrics(preds:list, references:list, instructions:list, inputs:list, categories=None, save_prefix=None, save_path=None):
decoded_preds = preds
result = compute_metrics(predictions=decoded_preds, references=references)
categories = ["_".join(it[0].lower().split()) for it in categories] if categories is not None else None
if categories is not None:
result_per_category = compute_grouped_metrics_v2(predictions=decoded_preds, references=references, groups=categories)
result.update(result_per_category)
    prediction_lens = [len(pred.split()) for pred in decoded_preds]  # unlike the original code, gen_len here counts words rather than tokens
result["gen_len"] = np.mean(prediction_lens)
result = {k: round(v, 4) for k, v in result.items()}

assert save_path is not None and save_prefix is not None, "The save_path and save_prefix should not be None"

if save_path is not None and save_prefix is not None:
with open(os.path.join(save_path, f"{save_prefix}_eval_predictions.jsonl"), "w") as fout:
for instruction, input, output, pred in zip(instructions, inputs, references, decoded_preds):
fout.write(json.dumps({
"Definition": instruction,
"Input": input,
"Output": output,
"Prediction": pred
}) + "\n")
        # save the scores
        with open(os.path.join(save_path, f"{save_prefix}_eval_scores.json"), "w") as fout:
            json.dump(result, fout, indent=4)

    return result



def main():
parser = argparse.ArgumentParser()
parser.add_argument("--results_path",type=str,default="./alpaca_2")

args, unparsed = parser.parse_known_args()
if unparsed:
raise ValueError(unparsed)


# read and calculate the metrics on all the four benchmarks
# superni
    # note that superni also needs the 'superni_test_11810_eval_usage.json' file,
    # because its categories are used to calculate the grouped metrics
with open(os.path.join(args.results_path, "superni.json"), "r") as fin:
superni_preds = json.load(fin)

with open("/data/rml6079/projects/muffin_llama/alpaca-lora/eval_benchmarks/superni_test_11810_eval_usage.json", "r") as fin:
superni_meta = json.load(fin)

# combine these two files to get the final list to calculate the metrics.
    # superni_results differs only in that each item carries a "response" field and its "output" is a list rather than a string
superni_results = process_superni(superni_preds, superni_meta)
calculate_metrics(superni_results, save_path=args.results_path, save_prefix="superni")
print("superni done")

# p3
with open(os.path.join(args.results_path, "p3.json"), "r") as fin:
p3_results = json.load(fin)
calculate_metrics(p3_results, save_path=args.results_path, save_prefix="p3")
print("p3 done")

# mmlu
with open(os.path.join(args.results_path, "mmlu.json"), "r") as fin:
mmlu_results = json.load(fin)
calculate_metrics(mmlu_results, save_path=args.results_path, save_prefix="mmlu")
print("mmlu done")

# bbh
with open(os.path.join(args.results_path, "bbh.json"), "r") as fin:
bbh_results = json.load(fin)
calculate_metrics(bbh_results, save_path=args.results_path, save_prefix="bbh")
print("bbh done")


if __name__ == "__main__":
main()
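
# Hedged run sketch (paths follow the default and file names used in main() above):
#   python calculate_metrics.py --results_path ./alpaca_2
# The results directory is expected to contain superni.json, p3.json, mmlu.json and bbh.json;
# the SuperNI metadata is read from the hard-coded path above. For each benchmark the script
# writes <prefix>_eval_predictions.jsonl and <prefix>_eval_scores.json into the results directory.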