diff --git a/xtuner/configs/llava/phi3_mini_4k_instruct_clip_vit_large_p14_336/finetune/llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_e1_gpu8_finetune.py b/xtuner/configs/llava/phi3_mini_4k_instruct_clip_vit_large_p14_336/finetune/llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_e1_gpu8_finetune.py new file mode 100644 index 000000000..a1d3cbcd8 --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_4k_instruct_clip_vit_large_p14_336/finetune/llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_e1_gpu8_finetune.py @@ -0,0 +1,205 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = 'microsoft/Phi-3-mini-4k-instruct' +visual_encoder_name_or_path = 'openai/clip-vit-large-patch14-336' +# Specify the pretrained pth +pretrained_pth = './work_dirs/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain/iter_2181.pth' # noqa: E501 + +# Data +data_root = './data/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(2048 - (336 / 14)**2) + +# Scheduler & Optimizer +batch_size = 8 # per_device +accumulative_counts = 2 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1000 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['请描述一下这张照片', 'Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + freeze_llm=False, + freeze_visual_encoder=True, + pretrained_pth=pretrained_pth, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + 
+####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/llava/phi3_mini_4k_instruct_clip_vit_large_p14_336/pretrain/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py b/xtuner/configs/llava/phi3_mini_4k_instruct_clip_vit_large_p14_336/pretrain/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py new file mode 100644 index 000000000..cdd4bb484 --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_4k_instruct_clip_vit_large_p14_336/pretrain/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py @@ -0,0 +1,199 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = 'microsoft/Phi-3-mini-4k-instruct' +visual_encoder_name_or_path = 'openai/clip-vit-large-patch14-336' + +# Data +data_root = './data/llava_data/' +data_path = data_root + 'LLaVA-Pretrain/blip_laion_cc_sbu_558k.json' +image_folder = data_root + 'LLaVA-Pretrain/images' +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(2048 - (336 / 14)**2) + +# Scheduler & Optimizer +batch_size = 32 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 1e-3 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 500 +save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 500 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['请描述一下这张照片', 'Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + 
type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + freeze_llm=True, + freeze_visual_encoder=True, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=False) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/llava/phi3_mini_4k_instruct_clip_vit_large_p14_336/pretrain/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_sharegpt4v_pretrain.py b/xtuner/configs/llava/phi3_mini_4k_instruct_clip_vit_large_p14_336/pretrain/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_sharegpt4v_pretrain.py new file mode 100644 index 000000000..e74b12097 --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_4k_instruct_clip_vit_large_p14_336/pretrain/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_sharegpt4v_pretrain.py @@ -0,0 +1,199 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = 'microsoft/Phi-3-mini-4k-instruct' +visual_encoder_name_or_path = 'openai/clip-vit-large-patch14-336' + +# Data +data_root = './data/sharegpt4v/' +data_path = data_root + 'share-captioner_coco_lcs_sam_1246k_1107.json' +image_folder = data_root + 'data' +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(4096 - (336 / 14)**2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 2 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 1e-3 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1000 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['请描述一下这张照片', 'Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + 
type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + freeze_llm=True, + freeze_visual_encoder=True, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=False) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/phi/phi3/phi3_mini_128k_instruct_full_alpaca_e3.py b/xtuner/configs/phi/phi3/phi3_mini_128k_instruct_full_alpaca_e3.py new file mode 100644 index 000000000..d60f67533 --- /dev/null +++ b/xtuner/configs/phi/phi3/phi3_mini_128k_instruct_full_alpaca_e3.py @@ -0,0 +1,199 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import AutoModelForCausalLM, AutoTokenizer + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import alpaca_map_fn, template_map_fn_factory +from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook, + VarlenAttnArgsToMessageHubHook) +from xtuner.engine.runner import TrainLoop +from xtuner.model import SupervisedFinetune +from xtuner.parallel.sequence import SequenceParallelSampler +from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'microsoft/Phi-3-mini-128k-instruct' +use_varlen_attn = False + +# Data +alpaca_en_path = 'tatsu-lab/alpaca' +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = 128 * 1024 +pack_to_max_length = True + +# parallel +sequence_parallel_size = 1 + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +accumulative_counts *= sequence_parallel_size +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 500 +save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 500 +SYSTEM = SYSTEM_TEMPLATE.alpaca +evaluation_inputs = [ + '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + use_varlen_attn=use_varlen_attn, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True)) + 
+####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +alpaca_en = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=alpaca_en_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=alpaca_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length, + use_varlen_attn=use_varlen_attn) + +sampler = SequenceParallelSampler \ + if sequence_parallel_size > 1 else DefaultSampler +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=alpaca_en, + sampler=dict(type=sampler, shuffle=True), + collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + system=SYSTEM, + prompt_template=prompt_template) +] + +if use_varlen_attn: + custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/phi/phi3/phi3_mini_128k_instruct_qlora_alpaca_e3.py b/xtuner/configs/phi/phi3/phi3_mini_128k_instruct_qlora_alpaca_e3.py new file mode 100644 index 000000000..f528da716 --- /dev/null +++ b/xtuner/configs/phi/phi3/phi3_mini_128k_instruct_qlora_alpaca_e3.py @@ -0,0 +1,219 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from peft import LoraConfig +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import alpaca_map_fn, template_map_fn_factory +from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook, + VarlenAttnArgsToMessageHubHook) +from xtuner.engine.runner import TrainLoop +from xtuner.model import SupervisedFinetune +from xtuner.parallel.sequence import SequenceParallelSampler +from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'microsoft/Phi-3-mini-128k-instruct' +use_varlen_attn = False + +# Data +alpaca_en_path = 'tatsu-lab/alpaca' +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = 128 * 1024 +pack_to_max_length = True + +# parallel +sequence_parallel_size = 1 + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +accumulative_counts *= sequence_parallel_size +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = AdamW +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 500 +save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 500 +SYSTEM = SYSTEM_TEMPLATE.alpaca +evaluation_inputs = [ + '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + use_varlen_attn=use_varlen_attn, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + 
pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +alpaca_en = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=alpaca_en_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=alpaca_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length, + use_varlen_attn=use_varlen_attn) + +sampler = SequenceParallelSampler \ + if sequence_parallel_size > 1 else DefaultSampler +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=alpaca_en, + sampler=dict(type=sampler, shuffle=True), + collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + system=SYSTEM, + prompt_template=prompt_template) +] + +if use_varlen_attn: + custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/phi/phi3/phi3_mini_4k_instruct_full_alpaca_e3.py b/xtuner/configs/phi/phi3/phi3_mini_4k_instruct_full_alpaca_e3.py new file mode 100644 index 000000000..64f198d34 --- /dev/null +++ b/xtuner/configs/phi/phi3/phi3_mini_4k_instruct_full_alpaca_e3.py @@ -0,0 +1,199 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import AutoModelForCausalLM, AutoTokenizer + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import alpaca_map_fn, template_map_fn_factory +from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook, + VarlenAttnArgsToMessageHubHook) +from xtuner.engine.runner import TrainLoop +from xtuner.model import SupervisedFinetune +from xtuner.parallel.sequence import SequenceParallelSampler +from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'microsoft/Phi-3-mini-4k-instruct' +use_varlen_attn = False + +# Data +alpaca_en_path = 'tatsu-lab/alpaca' +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = 4096 +pack_to_max_length = True + +# parallel +sequence_parallel_size = 1 + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +accumulative_counts *= sequence_parallel_size +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 500 +save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 500 +SYSTEM = SYSTEM_TEMPLATE.alpaca +evaluation_inputs = [ + '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + use_varlen_attn=use_varlen_attn, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True)) + 
+####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +alpaca_en = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=alpaca_en_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=alpaca_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length, + use_varlen_attn=use_varlen_attn) + +sampler = SequenceParallelSampler \ + if sequence_parallel_size > 1 else DefaultSampler +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=alpaca_en, + sampler=dict(type=sampler, shuffle=True), + collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + system=SYSTEM, + prompt_template=prompt_template) +] + +if use_varlen_attn: + custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/phi/phi3/phi3_mini_4k_instruct_qlora_alpaca_e3.py b/xtuner/configs/phi/phi3/phi3_mini_4k_instruct_qlora_alpaca_e3.py new file mode 100644 index 000000000..e90e17a14 --- /dev/null +++ b/xtuner/configs/phi/phi3/phi3_mini_4k_instruct_qlora_alpaca_e3.py @@ -0,0 +1,219 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from peft import LoraConfig +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import alpaca_map_fn, template_map_fn_factory +from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook, + VarlenAttnArgsToMessageHubHook) +from xtuner.engine.runner import TrainLoop +from xtuner.model import SupervisedFinetune +from xtuner.parallel.sequence import SequenceParallelSampler +from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'microsoft/Phi-3-mini-4k-instruct' +use_varlen_attn = False + +# Data +alpaca_en_path = 'tatsu-lab/alpaca' +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = 4096 +pack_to_max_length = True + +# parallel +sequence_parallel_size = 1 + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +accumulative_counts *= sequence_parallel_size +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = AdamW +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 500 +save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 500 +SYSTEM = SYSTEM_TEMPLATE.alpaca +evaluation_inputs = [ + '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + use_varlen_attn=use_varlen_attn, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + 
pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +alpaca_en = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=alpaca_en_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=alpaca_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length, + use_varlen_attn=use_varlen_attn) + +sampler = SequenceParallelSampler \ + if sequence_parallel_size > 1 else DefaultSampler +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=alpaca_en, + sampler=dict(type=sampler, shuffle=True), + collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + system=SYSTEM, + prompt_template=prompt_template) +] + +if use_varlen_attn: + custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/model/llava.py b/xtuner/model/llava.py index 19b427a75..d588b6cd5 100644 --- a/xtuner/model/llava.py +++ b/xtuner/model/llava.py @@ -196,11 +196,13 @@ def _prepare_for_long_context_training(cfg, llm_cfg, def _prepare_for_flash_attn(cfg, llm_cfg): cls_name = type(llm_cfg).__name__ SUPPORT_SDPA_ATTN = ('LlamaConfig', 'GemmaConfig', 'MistralConfig', - 'MixtralConfig', 'Qwen2Config', - 'Starcoder2Config', 'Starcoder2Config') + 'MixtralConfig', 'Qwen2Config', 'Qwen2MoeConfig', + 'Starcoder2Config', 'Starcoder2Config', + 'Phi3Config') SUPPORT_FLASH_ATTN2 = ('InternLM2Config', 'LlamaConfig', 'GemmaConfig', 'MistralConfig', 'MixtralConfig', 'Qwen2Config', - 'Starcoder2Config', 'Starcoder2Config') + 'Qwen2MoeConfig', 'Starcoder2Config', + 'Starcoder2Config', 'Phi3Config') if SUPPORT_FLASH2 and cls_name in SUPPORT_FLASH_ATTN2: cfg.torch_dtype = torch.bfloat16 \ diff --git a/xtuner/model/modules/dispatch/__init__.py b/xtuner/model/modules/dispatch/__init__.py index a154eabda..97f511ebe 100644 --- a/xtuner/model/modules/dispatch/__init__.py +++ b/xtuner/model/modules/dispatch/__init__.py @@ -93,6 +93,53 @@ def dispatch_llama_rmsnorm_forward(model): module.forward = types.MethodType(rms_norm_forward, module) +def dispatch_phi3_attn_forward(model, use_varlen_attn): + if use_varlen_attn: + assert SUPPORT_FLASH2 and SUPPORT_TRITON, \ + 'flash_attn and triton is required if you want to use varlen_attn.' 
+    elif not SUPPORT_FLASH2:
+        return
+
+    from .phi3 import phi3_attn_forward, phi3_varlen_attn_forward
+
+    print_log(NO_ATTN_WEIGHTS_MSG, 'current', logging.WARNING)
+    for module in model.modules():
+        # No need to dispatch if
+        # type(module).__name__ == 'Phi3SdpaAttention', as flash_attn is
+        # required when using sequence parallel
+        if type(module).__name__ in ('Phi3Attention', 'Phi3FlashAttention2'):
+            if use_varlen_attn:
+                print_log('dispatch phi3 varlen attn forward', 'current')
+                if IS_LOW_VERSION_TRANSFORMERS:
+                    raise RuntimeError(
+                        'Phi-3 needs transformers version >= 4.39, but got '
+                        f'{transformers.__version__}')
+                else:
+                    module.forward = types.MethodType(phi3_varlen_attn_forward,
+                                                      module)
+            else:
+                print_log('dispatch phi3 attn forward', 'current')
+                if IS_LOW_VERSION_TRANSFORMERS:
+                    raise RuntimeError(
+                        'Phi-3 needs transformers version >= 4.39, but got '
+                        f'{transformers.__version__}')
+                else:
+                    module.forward = types.MethodType(phi3_attn_forward,
+                                                      module)
+
+
+def dispatch_phi3_rmsnorm_forward(model):
+    if not SUPPORT_TRITON:
+        return
+
+    from .triton_kernels import rms_norm_forward
+
+    for module in model.modules():
+        if type(module).__name__ == 'Phi3RMSNorm':
+            print_log('dispatch phi3 rmsnorm forward', 'current')
+            module.forward = types.MethodType(rms_norm_forward, module)
+
+
 def dispatch_internlm_attn_forward(model, use_varlen_attn):
     if use_varlen_attn:
         assert SUPPORT_FLASH2 and SUPPORT_TRITON, \
             'flash_attn and triton is required if you want to use varlen_attn.'
@@ -405,6 +452,10 @@ def dispatch_modules(model, use_varlen_attn=False):
         dispatch_llama_attn_forward(model, use_varlen_attn)
         if USE_TRITON_KERNEL:
             dispatch_llama_rmsnorm_forward(model)
+    elif 'phi3' in model_name:
+        dispatch_phi3_attn_forward(model, use_varlen_attn)
+        if USE_TRITON_KERNEL:
+            dispatch_phi3_rmsnorm_forward(model)
     elif 'baichuan' in model_name:
         dispath_baichuan2_norm_head_forward(model)
         dispath_baichuan_7b_attn_forward(model)
diff --git a/xtuner/model/modules/dispatch/phi3.py b/xtuner/model/modules/dispatch/phi3.py
new file mode 100644
index 000000000..04a437a8e
--- /dev/null
+++ b/xtuner/model/modules/dispatch/phi3.py
@@ -0,0 +1,452 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from typing import Optional, Tuple
+
+import torch
+import torch.distributed as dist
+from mmengine import MessageHub
+
+from xtuner.parallel.sequence import (get_sequence_parallel_world_size,
+                                      post_process_for_sequence_parallel_attn,
+                                      pre_process_for_sequence_parallel_attn)
+from .attention import flash_attn_wo_mask, varlen_flash_attn
+
+try:
+    from transformers.cache_utils import Cache
+except ImportError:
+
+    class Cache:
+        pass
+
+
+import inspect
+
+_flash_supports_window_size = False
+try:
+    from flash_attn import flash_attn_func
+
+    _flash_supports_window_size = 'window_size' in list(
+        inspect.signature(flash_attn_func).parameters)
+
+    if not _flash_supports_window_size:
+        raise ValueError(
+            'Please update flash-attention to support window size.')
+# else:
+except ImportError:
+    pass
+
+
+# Copied from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/3a811845d89f3c1b3f41b341d0f9f05104769f35/modeling_phi3.py#L302  # noqa:E501
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """This is the equivalent of torch.repeat_interleave(x, dim=1,
+    repeats=n_rep).
+
+    The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to
+    (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :,
+                                  None, :, :].expand(batch,
+                                                     num_key_value_heads,
+                                                     n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen,
+                                 head_dim)
+
+
+# https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/3a811845d89f3c1b3f41b341d0f9f05104769f35/modeling_phi3.py#L247  # noqa:E501
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., :x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2:]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+# Copied from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/3a811845d89f3c1b3f41b341d0f9f05104769f35/modeling_phi3.py#L255  # noqa:E501
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """  # noqa:E501
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+def phi3_attn_forward(
+    self,
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.LongTensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_value: Optional[Cache] = None,
+    output_attentions: bool = False,
+    use_cache: bool = False,
+    cache_position: Optional[torch.LongTensor] = None,
+    **kwargs,
+):
+    if not _flash_supports_window_size:
+        raise ValueError(
+            'The current flash attention version does not support '
+            'sliding window attention.')
+
+    output_attentions = False
+
+    if 'padding_mask' in kwargs:
+        warnings.warn(
+            'Passing `padding_mask` is deprecated and will be removed in '
+            'v4.37. Please make sure to use `attention_mask` instead.')
+
+        # overwrite attention_mask with padding_mask
+        attention_mask = kwargs.pop('padding_mask')
+
+    bsz, q_len, _ = hidden_states.size()
+
+    qkv = self.qkv_proj(hidden_states)
+    query_pos = self.num_heads * self.head_dim
+    query_states = qkv[..., :query_pos]
+    key_states = qkv[..., query_pos:query_pos +
+                     self.num_key_value_heads * self.head_dim]
+    value_states = qkv[...,
+                       query_pos + self.num_key_value_heads * self.head_dim:]
+
+    # Flash attention requires the input to have the shape
+    # batch_size x seq_length x head_dim x hidden_dim
+    # therefore we just need to keep the original shape
+    query_states = query_states.view(bsz, q_len, self.num_heads,
+                                     self.head_dim).transpose(1, 2)
+    key_states = key_states.view(bsz, q_len, self.num_key_value_heads,
+                                 self.head_dim).transpose(1, 2)
+    value_states = value_states.view(bsz, q_len, self.num_key_value_heads,
+                                     self.head_dim).transpose(1, 2)
+
+    kv_seq_len = key_states.shape[-2]
+    if past_key_value is not None:
+        if self.layer_idx is None:
+            raise ValueError(
+                'The cache structure has changed since version v4.36. '
+                f'If you are using {self.__class__.__name__} '
+                'for auto-regressive decoding with k/v caching, '
+                'please make sure to initialize the attention class '
+                'with a layer index.')
+        kv_seq_len += past_key_value.get_usable_length(kv_seq_len,
+                                                       self.layer_idx)
+
+    rotary_seq_len = max(kv_seq_len, position_ids.max().item() + 1)
+    cos, sin = self.rotary_emb(
+        value_states, position_ids, seq_len=rotary_seq_len)
+
+    query_states, key_states = apply_rotary_pos_emb(query_states, key_states,
+                                                    cos, sin, position_ids)
+
+    use_sliding_windows = (
+        _flash_supports_window_size
+        and getattr(self.config, 'sliding_window', None) is not None
+        and kv_seq_len > self.config.sliding_window)
+
+    if past_key_value is not None:
+        # Activate slicing cache only if the config has a value
+        # `sliding_windows` attribute
+        cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
+        if (getattr(self.config, 'sliding_window', None) is not None
+                and kv_seq_len > self.config.sliding_window
+                and cache_has_contents):
+            slicing_tokens = 1 - self.config.sliding_window
+
+            past_key = past_key_value[self.layer_idx][0]
+            past_value = past_key_value[self.layer_idx][1]
+
+            past_key = past_key[:, :, slicing_tokens:, :].contiguous()
+            past_value = past_value[:, :, slicing_tokens:, :].contiguous()
+
+            if past_key.shape[-2] != self.config.sliding_window - 1:
+                raise ValueError(
+                    'past key must have a shape of (`batch_size, num_heads, '
+                    'self.config.sliding_window-1, head_dim`), got'
+                    f' {past_key.shape}')
+
+            if attention_mask is not None:
+                attention_mask = attention_mask[:, slicing_tokens:]
+                attention_mask = torch.cat(
+                    [attention_mask,
+                     torch.ones_like(attention_mask[:, -1:])],
+                    dim=-1)
+
+        cache_kwargs = {'sin': sin, 'cos': cos}  # Specific to RoPE models
+        key_states, value_states = past_key_value.update(
+            key_states, value_states, self.layer_idx, cache_kwargs)
+
+    # repeat k/v heads if n_kv_heads < n_heads
+    key_states = repeat_kv(key_states, self.num_key_value_groups)
+    value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+    attn_dropout = self.attention_dropout if self.training else 0.0
+
+    # In PEFT, usually we cast the layer norms in float32 for training
+    # stability reasons, therefore the input hidden states get silently
+    # cast to float32. Hence, we need to cast them back to the correct dtype
+    # just to be sure everything works as expected.
+    enable_sequence_parallel = (
+        dist.is_initialized() and get_sequence_parallel_world_size() > 1
+        and self.training)
+    if enable_sequence_parallel:
+        # (b, s // sp_world_size, nd, dim) -> (b, s, nd // sp_world_size, dim)
+        query_states, key_states, value_states = \
+            pre_process_for_sequence_parallel_attn(
+                query_states, key_states, value_states,
+                scatter_dim=2, gather_dim=1)
+
+    attn_output = self._flash_attention_forward(
+        query_states,
+        key_states,
+        value_states,
+        attention_mask,
+        query_states.shape[1],
+        dropout=attn_dropout,
+        use_sliding_windows=use_sliding_windows,
+    )
+
+    if enable_sequence_parallel:
+        # (b, s, nd // sp_world_size, dim) -> (b, s // sp_world_size, nd, dim)
+        attn_output = post_process_for_sequence_parallel_attn(
+            attn_output, scatter_dim=1, gather_dim=2)
+
+    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+    attn_output = self.o_proj(attn_output)
+
+    if not output_attentions:
+        attn_weights = None
+
+    return attn_output, attn_weights, past_key_value
+
+
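+# Note: the varlen variant below assumes packed data: several samples are
+# concatenated into one batch-size-1 sequence, and the per-sample boundaries
+# (`cumulative_len`, `max_seqlen`) are fetched from the 'varlen_attn_args'
+# MessageHub instance.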
+def phi3_varlen_attn_forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor],
+           Optional[Tuple[torch.Tensor]]]:
+    if not _flash_supports_window_size:
+        raise ValueError(
+            'The current flash attention version does not support '
+            'sliding window attention.')
+
+    output_attentions = False
+
+    is_training = self.training
+
+    message_hub = MessageHub.get_instance('varlen_attn_args')
+    rank = dist.get_rank()
+    cumulative_len = message_hub.get_info(f'cumulative_len_rank_{rank}')
+    max_seqlen = message_hub.get_info(f'max_seqlen_rank_{rank}')
+
+    assert is_training == (cumulative_len is not None) == (
+        past_key_value is None)
+
+    if 'padding_mask' in kwargs:
+        warnings.warn(
+            'Passing `padding_mask` is deprecated and will be removed in '
+            'v4.37. Please make sure to use `attention_mask` instead.')
+
+        # overwrite attention_mask with padding_mask
+        attention_mask = kwargs.pop('padding_mask')
+
+    bsz, q_len, _ = hidden_states.size()
+    assert bsz == 1, (f'If utilizing local attention, the batch size should be'
+                      f' set to 1, but got {bsz}')
+    # attention_mask is set to None if no padding token in input_ids
+    # varlen attn needs data packing so no padding tokens in input_ids
+    assert attention_mask is None
+
+    qkv = self.qkv_proj(hidden_states)
+    query_pos = self.num_heads * self.head_dim
+    query_states = qkv[..., :query_pos]
+    key_states = qkv[..., query_pos:query_pos +
+                     self.num_key_value_heads * self.head_dim]
+    value_states = qkv[...,
+                       query_pos + self.num_key_value_heads * self.head_dim:]
+
+    # Flash attention requires the input to have the shape
+    # batch_size x seq_length x num_heads x head_dim
+    # therefore we just need to keep the original shape
+    query_states = query_states.view(bsz, q_len, self.num_heads,
+                                     self.head_dim).transpose(1, 2)
+    key_states = key_states.view(bsz, q_len, self.num_key_value_heads,
+                                 self.head_dim).transpose(1, 2)
+    value_states = value_states.view(bsz, q_len, self.num_key_value_heads,
+                                     self.head_dim).transpose(1, 2)
+
+    kv_seq_len = key_states.shape[-2]
+    if past_key_value is not None:
+        if self.layer_idx is None:
+            raise ValueError(
+                'The cache structure has changed since version v4.36. '
+                f'If you are using {self.__class__.__name__} '
+                'for auto-regressive decoding with k/v caching, '
+                'please make sure to initialize the attention class '
+                'with a layer index.')
+        kv_seq_len += past_key_value.get_usable_length(kv_seq_len,
+                                                       self.layer_idx)
+
+    assert position_ids is not None
+    rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
+    cos, sin = self.rotary_emb(
+        value_states, position_ids, seq_len=rotary_seq_len)
+
+    query_states, key_states = apply_rotary_pos_emb(query_states, key_states,
+                                                    cos, sin, position_ids)
+
+    use_sliding_windows = (
+        _flash_supports_window_size
+        and getattr(self.config, 'sliding_window', None) is not None
+        and kv_seq_len > self.config.sliding_window)
+
+    if past_key_value is not None:
+        # Activate slicing cache only if the config has a value for the
+        # `sliding_window` attribute
+        cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
+        if (getattr(self.config, 'sliding_window', None) is not None
+                and kv_seq_len > self.config.sliding_window
+                and cache_has_contents):
+            slicing_tokens = 1 - self.config.sliding_window
+
+            past_key = past_key_value[self.layer_idx][0]
+            past_value = past_key_value[self.layer_idx][1]
+
+            past_key = past_key[:, :, slicing_tokens:, :].contiguous()
+            past_value = past_value[:, :, slicing_tokens:, :].contiguous()
+
+            if past_key.shape[-2] != self.config.sliding_window - 1:
+                raise ValueError(
+                    'past key must have a shape of (`batch_size, num_heads, '
+                    'self.config.sliding_window-1, head_dim`), got'
+                    f' {past_key.shape}')
+
+            if attention_mask is not None:
+                attention_mask = attention_mask[:, slicing_tokens:]
+                attention_mask = torch.cat(
+                    [attention_mask,
+                     torch.ones_like(attention_mask[:, -1:])],
+                    dim=-1)
+
+        cache_kwargs = {'sin': sin, 'cos': cos}  # Specific to RoPE models
+        key_states, value_states = past_key_value.update(
+            key_states, value_states, self.layer_idx, cache_kwargs)
+
+    # repeat k/v heads if n_kv_heads < n_heads
+    key_states = repeat_kv(key_states, self.num_key_value_groups)
+    value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+    # In PEFT, the layer norms are usually cast to float32 for training
+    # stability, so the input hidden states may be silently cast to float32.
+    # Hence, we cast them back to float16 just to be sure everything works
+    # as expected.
+
+    if query_states.dtype == torch.float32:
+        if torch.is_autocast_enabled():
+            target_dtype = torch.get_autocast_gpu_dtype()
+        # Handle the case where the model is quantized
+        elif hasattr(self.config, '_pre_quantization_dtype'):
+            target_dtype = self.config._pre_quantization_dtype
+        else:
+            target_dtype = self.qkv_proj.weight.dtype
+
+        query_states = query_states.to(target_dtype)
+        key_states = key_states.to(target_dtype)
+        value_states = value_states.to(target_dtype)
+
+    # Reshape to the expected shape for Flash Attention
+    query_states = query_states.transpose(1, 2)
+    key_states = key_states.transpose(1, 2)
+    value_states = value_states.transpose(1, 2)
+
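+    # Training goes through the varlen kernel, which uses `cumulative_len`
+    # to keep attention inside each packed sub-sequence; inference (no
+    # packing, batch size 1) uses plain flash attention without a mask.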
+    # ----------------- flash attention forward ------------------------#
+
+    if not self._flash_attn_uses_top_left_mask:
+        causal = self.is_causal
+    else:
+        causal = self.is_causal and q_len != 1
+
+    use_sliding_windows = (
+        _flash_supports_window_size
+        and getattr(self.config, 'sliding_window', None) is not None
+        and kv_seq_len > self.config.sliding_window)
+
+    window_size = (self.config.sliding_window,
+                   self.config.sliding_window) if use_sliding_windows else (-1,
+                                                                            -1)
+    attn_dropout = self.attention_dropout if self.training else 0.0
+
+    if is_training:
+        attn_output = varlen_flash_attn(
+            query_states,
+            key_states,
+            value_states,
+            cumulative_len,
+            max_seqlen,
+            causal=causal,
+            dropout_p=attn_dropout,
+            window_size=window_size,
+            training=True)
+    else:
+        attn_output = flash_attn_wo_mask(
+            query_states,
+            key_states,
+            value_states,
+            causal=causal,
+            dropout_p=attn_dropout,
+            window_size=window_size,
+            training=False)
+
+    # ---------------- flash attention forward end ------------------- #
+
+    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+    attn_output = self.o_proj(attn_output)
+
+    if not output_attentions:
+        attn_weights = None
+
+    return attn_output, attn_weights, past_key_value
diff --git a/xtuner/model/sft.py b/xtuner/model/sft.py
index 2f8f52569..94da26789 100644
--- a/xtuner/model/sft.py
+++ b/xtuner/model/sft.py
@@ -179,11 +179,12 @@ def _prepare_for_flash_attn(cfg, llm_cfg):
     cls_name = type(llm_cfg).__name__
     SUPPORT_SDPA_ATTN = ('LlamaConfig', 'GemmaConfig', 'MistralConfig',
                          'MixtralConfig', 'Qwen2Config', 'Qwen2MoeConfig',
-                         'Starcoder2Config', 'Starcoder2Config')
+                         'Starcoder2Config', 'Starcoder2Config',
+                         'Phi3Config')
     SUPPORT_FLASH_ATTN2 = ('InternLM2Config', 'LlamaConfig', 'GemmaConfig',
                            'MistralConfig', 'MixtralConfig', 'Qwen2Config',
                            'Qwen2MoeConfig', 'Starcoder2Config',
-                           'Starcoder2Config')
+                           'Starcoder2Config', 'Phi3Config')
 
     if SUPPORT_FLASH2 and cls_name in SUPPORT_FLASH_ATTN2:
         cfg.torch_dtype = torch.bfloat16 if (
diff --git a/xtuner/utils/templates.py b/xtuner/utils/templates.py
index a0d7de710..a36a15371 100644
--- a/xtuner/utils/templates.py
+++ b/xtuner/utils/templates.py
@@ -151,6 +151,13 @@
         SUFFIX='<|eot_id|>',
         SUFFIX_AS_EOS=True,
         STOP_WORDS=['<|eot_id|>']),
+    phi3_chat=dict(
+        SYSTEM='<|system|>{system}<|end|>\n',
+        INSTRUCTION='<|user|>{input}<|end|>\n<|assistant|>\n',
+        SUFFIX='<|end|>',
+        SUFFIX_AS_EOS=True,
+        SEP='\n',
+        STOP_WORDS=['<|end|>']),
 )
 
 SYSTEM_TEMPLATE = ConfigDict(