
Commit

quality
pacman100 committed Mar 12, 2024
1 parent e54c97d commit b7a8edb
Showing 2 changed files with 14 additions and 42 deletions.
40 changes: 10 additions & 30 deletions examples/sft/train.py
@@ -16,9 +16,7 @@ class ModelArguments:
     """

     model_name_or_path: str = field(
-        metadata={
-            "help": "Path to pretrained model or model identifier from huggingface.co/models"
-        }
+        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
     )
     chat_template_format: Optional[str] = field(
         default="none",
@@ -31,9 +29,7 @@ class ModelArguments:
     lora_r: Optional[int] = field(default=64)
     lora_target_modules: Optional[str] = field(
         default="q_proj,k_proj,v_proj,o_proj,down_proj,up_proj,gate_proj",
-        metadata={
-            "help": "comma separated list of target modules to apply LoRA layers to"
-        },
+        metadata={"help": "comma separated list of target modules to apply LoRA layers to"},
     )
     use_nested_quant: Optional[bool] = field(
         default=False,
@@ -87,21 +83,15 @@ class DataTrainingArguments:
         default=False,
         metadata={"help": "Use packing dataset creating."},
     )
-    dataset_text_field: str = field(
-        default="text", metadata={"help": "Dataset field to use as input text."}
-    )
+    dataset_text_field: str = field(default="text", metadata={"help": "Dataset field to use as input text."})
     max_seq_length: Optional[int] = field(default=512)
     append_concat_token: Optional[bool] = field(
         default=False,
-        metadata={
-            "help": "If True, appends `eos_token_id` at the end of each sample being packed."
-        },
+        metadata={"help": "If True, appends `eos_token_id` at the end of each sample being packed."},
     )
     add_special_tokens: Optional[bool] = field(
         default=False,
-        metadata={
-            "help": "If True, tokenizers adds special tokens to each sample being packed."
-        },
+        metadata={"help": "If True, tokenizers adds special tokens to each sample being packed."},
     )
     splits: Optional[str] = field(
         default="train,test",
@@ -114,19 +104,13 @@ def main(model_args, data_args, training_args):
     set_seed(training_args.seed)

     # model
-    model, peft_config, tokenizer = create_and_prepare_model(
-        model_args, data_args, training_args
-    )
+    model, peft_config, tokenizer = create_and_prepare_model(model_args, data_args, training_args)

     # gradient ckpt
     model.config.use_cache = not training_args.gradient_checkpointing
-    training_args.gradient_checkpointing = (
-        training_args.gradient_checkpointing and not model_args.use_unsloth
-    )
+    training_args.gradient_checkpointing = training_args.gradient_checkpointing and not model_args.use_unsloth
     if training_args.gradient_checkpointing:
-        training_args.gradient_checkpointing_kwargs = {
-            "use_reentrant": model_args.use_reentrant
-        }
+        training_args.gradient_checkpointing_kwargs = {"use_reentrant": model_args.use_reentrant}

     # datasets
     train_dataset, eval_dataset = create_datasets(
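Note: `gradient_checkpointing_kwargs` is forwarded by the `Trainer` to `torch.utils.checkpoint`, and the flag is switched off when `use_unsloth` is set, presumably because unsloth applies its own optimizations. A minimal standalone sketch of the resulting configuration (values illustrative, not from this commit):

```python
from transformers import TrainingArguments

# Non-reentrant activation checkpointing; the script exposes this as --use_reentrant.
training_args = TrainingArguments(
    output_dir="sft-out",  # illustrative path
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
)
```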
@@ -168,15 +152,11 @@ def main(model_args, data_args, training_args):


 if __name__ == "__main__":
-    parser = HfArgumentParser(
-        (ModelArguments, DataTrainingArguments, TrainingArguments)
-    )
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
         # If we pass only one argument to the script and it's the path to a json file,
         # let's parse it to get our arguments.
-        model_args, data_args, training_args = parser.parse_json_file(
-            json_file=os.path.abspath(sys.argv[1])
-        )
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
     else:
         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
     main(model_args, data_args, training_args)
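Note: the `__main__` block above supports two invocation styles: all arguments in a single JSON file, or regular CLI flags. A self-contained sketch of both code paths (the dataclass and its values are illustrative, not from this commit):

```python
import json
import tempfile
from dataclasses import dataclass, field

from transformers import HfArgumentParser


@dataclass
class ExampleArguments:
    model_name_or_path: str = field(default="gpt2")  # illustrative default
    max_seq_length: int = field(default=512)


parser = HfArgumentParser(ExampleArguments)

# CLI-style parsing, as in the `else` branch.
(args,) = parser.parse_args_into_dataclasses(["--max_seq_length", "1024"])
assert args.max_seq_length == 1024

# JSON-file parsing, as in the `.json` branch.
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump({"model_name_or_path": "gpt2", "max_seq_length": 2048}, f)
(args,) = parser.parse_json_file(json_file=f.name)
assert args.max_seq_length == 2048
```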
16 changes: 4 additions & 12 deletions examples/sft/utils.py
@@ -64,9 +64,7 @@ def preprocess(samples):
         elif "test" in split:
             raw_datasets["test"] = dataset
         else:
-            raise ValueError(
-                f"Split type {split} not recognized as one of test or train."
-            )
+            raise ValueError(f"Split type {split} not recognized as one of test or train.")

     if apply_chat_template:
         raw_datasets = raw_datasets.map(
@@ -77,9 +75,7 @@ def preprocess(samples):

     train_data = raw_datasets["train"]
     valid_data = raw_datasets["test"]
-    print(
-        f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}"
-    )
+    print(f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}")
     print(f"A sample of train dataset: {train_data[0]}")

     return train_data, valid_data
@@ -115,9 +111,7 @@ def create_and_prepare_model(args, data_args, training_args):
         major, _ = torch.cuda.get_device_capability()
         if major >= 8:
             print("=" * 80)
-            print(
-                "Your GPU supports bfloat16, you can accelerate training with the argument --bf16"
-            )
+            print("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
             print("=" * 80)
     elif args.use_8bit_quantization:
         bnb_config = BitsAndBytesConfig(load_in_8bit=args.use_8bit_quantization)
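Note: the check above keys off the CUDA compute capability major version; bfloat16 has hardware support on Ampere (SM 8.x) and newer GPUs. A tiny standalone equivalent:

```python
import torch

if torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    # Ampere (SM 8.x) and newer GPUs support bfloat16 natively.
    print(f"Compute capability {major}.x -> consider --bf16: {major >= 8}")
```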
@@ -175,9 +169,7 @@ def create_and_prepare_model(args, data_args, training_args):
         # make embedding resizing configurable?
         model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)
     else:
-        tokenizer = AutoTokenizer.from_pretrained(
-            args.model_name_or_path, trust_remote_code=True
-        )
+        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, trust_remote_code=True)
         tokenizer.pad_token = tokenizer.eos_token

     if args.use_unsloth:
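Note: two details in the hunk above are worth spelling out: models without a pad token reuse EOS for padding, and the embedding matrix is resized to a multiple of 8 rows so the enlarged vocabulary stays tensor-core friendly. A minimal sketch combining both (the model id is illustrative; the script takes --model_name_or_path):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "gpt2"  # illustrative
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # as in the `else` branch above

# Round the embedding matrix up to a multiple of 8 rows for efficient matmuls.
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)
```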
