# config_pllava_nframe.py
from tasks.train.instruction_data import *
# ========================= data ==========================
# train_corpus = "videochat2_instruction"
train_corpus = "videochat2_instruction_full"
train_file = "${available_corpus[${train_corpus}]}" # for lazy evaluation
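# The "${...}" strings in this file are placeholders, not Python: the config
# loader substitutes them after the module is read (available_corpus comes from
# the star-import above). A minimal resolver sketch, commented out so it does
# not affect this config -- an illustrative assumption, not this repo's actual
# loader; nested forms like the one above would need an extra substitution pass:
#
# import re
#
# def resolve_placeholders(cfg, root=None):
#     """Recursively replace simple "${name}" strings with top-level values."""
#     root = cfg if root is None else root
#     if isinstance(cfg, dict):
#         return {k: resolve_placeholders(v, root) for k, v in cfg.items()}
#     if isinstance(cfg, str):
#         m = re.fullmatch(r"\$\{(\w+)\}", cfg)
#         if m:
#             return resolve_placeholders(root[m.group(1)], root)
#     return cfg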
test_file = dict()
test_types = []
num_workers = 8
save_steps = 10000
ckpt_steps = 1000
stop_key = None
deepspeed = False
# ========================= input ==========================
num_frames = 16
num_frames_test = 1
batch_size = 1
gradient_accumulation_steps = 16
max_txt_l = 512
max_train_steps = None
pre_text = False
gradient_checkpointing = False  # NOTE: overridden to True near the end of this file
inputs = dict(
    image_res=336,
    video_input=dict(
        num_frames="${num_frames}",
        sample_type="rand",
        num_frames_test="${num_frames_test}",
        sample_type_test="middle",
        random_aug=False,
    ),
    max_txt_l=dict(image="${max_txt_l}", video="${max_txt_l}"),
    batch_size=dict(image="${batch_size}", video="${batch_size}"),
    batch_size_test=dict(image="${batch_size}", video="${batch_size}"),
)
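# "rand" (train) and "middle" (test) name two common frame-sampling strategies:
# split the video into num_frames equal segments, then pick a random index or
# the middle index of each segment. Commented-out sketch of that convention
# (an assumption for illustration; the dataset code may differ in details):
#
# import random
#
# def sample_frame_indices(total_frames, num_frames, sample_type="rand"):
#     seg = total_frames / num_frames
#     if sample_type == "rand":
#         return [int(seg * i + random.random() * seg) for i in range(num_frames)]
#     return [int(seg * i + seg / 2) for i in range(num_frames)]  # "middle"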
# ========================= model ==========================
model = dict(
    repo_id="llava-hf/llava-v1.6-vicuna-7b-hf",
    pretrained_path=None,
    load_from_origin=False,
    origin_vision="",
    origin_llm="",
    vision_encoder=dict(
        name="vit_l14",  # the dataset uses this to pick the pretrained model's normalization mean/std
    ),
    torch_dtype='bfloat16',
    freeze_projector=False,
    freeze_lm=True,
    freeze_vision_tower=True,
    lora_target_modules=["q_proj", "v_proj"],  # for llama/mistral/gemma
    use_lora=True,
    lora_r=128,
    lora_alpha=32,
    lora_dropout=0.05,
    num_frames="${num_frames}",
    pooling_method='avg',
    use_pooling=True,
    frame_shape=(24, 24),
    pooling_shape=(16, 8, 8),
)
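# frame_shape/pooling_shape describe the pooling of visual tokens: each frame
# yields a 24x24 token grid, and the (T, H, W) token volume is average-pooled
# down to (16, 8, 8). Commented-out sketch using adaptive average pooling (an
# illustrative assumption; the real model's tensor layout may differ):
#
# import torch.nn.functional as F
#
# def pool_visual_tokens(feats, t=16, frame_shape=(24, 24), pooling_shape=(16, 8, 8)):
#     """feats: (batch, t * 24 * 24, hidden) visual tokens from the encoder."""
#     b, _, d = feats.shape
#     x = feats.view(b, t, *frame_shape, d).permute(0, 4, 1, 2, 3)  # (b, d, t, 24, 24)
#     x = F.adaptive_avg_pool3d(x, pooling_shape)                   # (b, d, 16, 8, 8)
#     return x.permute(0, 2, 3, 4, 1).flatten(1, 3)                 # (b, 16*8*8, d)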
preprocess = dict(
    system="",
    mm_alone=True,
    random_shuffle=True,
    add_second_msg=True,
    roles=['USER:', 'ASSISTANT:'],
    end_signal=(' ', '</s>'),
    begin_signal='',
    dataset_image_placeholder='<Image></Image>',
    dataset_video_placeholder='<Video></Video>',
    image_token_index=32000,
    max_txt_l="${max_txt_l}",
    ignore_index=-100,  # default ignore_index of torch.nn.CrossEntropyLoss
    center_pad=False,
    longest_edge=762,
    shortest_edge=336,
    clip_transform=False,
    num_frames="${num_frames}",
)
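# ignore_index=-100 is the usual masking convention for the language-model
# loss: prompt tokens get label -100 so only response tokens are scored, and
# -100 is the default ignore_index of torch.nn.CrossEntropyLoss. Commented-out
# sketch with made-up token ids:
#
# import torch
# import torch.nn.functional as F
#
# logits = torch.randn(1, 4, 32064)              # (batch, seq, vocab)
# labels = torch.tensor([[-100, -100, 319, 2]])  # prompt masked, reply scored
# loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1),
#                        ignore_index=-100)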
optimizer = dict(
    opt="adamW",
    lr=2e-5,
    opt_betas=[0.9, 0.999],  # default
    weight_decay=0.02,
    max_grad_norm=-1,  # set a positive float to enable clipping; -1 disables it
    # use a different lr for some modules, e.g., larger lr for new modules
    different_lr=dict(enable=False, module_names=[], lr=1e-3),
)
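# different_lr, when enabled, trains the listed modules at their own learning
# rate. Commented-out sketch of the usual parameter-group recipe ("projector"
# is a hypothetical module name for illustration):
#
# import torch
#
# def build_param_groups(net, base_lr=2e-5, module_names=("projector",), special_lr=1e-3):
#     special, regular = [], []
#     for name, p in net.named_parameters():
#         if p.requires_grad:
#             (special if any(m in name for m in module_names) else regular).append(p)
#     return [{"params": regular, "lr": base_lr},
#             {"params": special, "lr": special_lr}]
#
# # e.g. torch.optim.AdamW(build_param_groups(net), betas=(0.9, 0.999), weight_decay=0.02)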
# scheduler = dict(sched="cosine", epochs=3, min_lr_multi=0.25, warmup_epochs=0.6)
scheduler = dict(
    is_videochat2_custom=False,
    sched="cosine",
    epochs=2,
    warmup_ratio=0.2,
    min_lr_multi=0.25,
)
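# The intended schedule: linear warmup over the first warmup_ratio of steps,
# then cosine decay to min_lr_multi * lr rather than to zero. Commented-out
# sketch of that multiplier (illustrative; the repo's scheduler may differ):
#
# import math
#
# def lr_multiplier(step, total_steps, warmup_ratio=0.2, min_lr_multi=0.25):
#     warmup = max(1, int(total_steps * warmup_ratio))
#     if step < warmup:
#         return step / warmup                               # linear warmup
#     progress = (step - warmup) / max(1, total_steps - warmup)
#     cosine = 0.5 * (1.0 + math.cos(math.pi * progress))    # decays 1 -> 0
#     return min_lr_multi + (1.0 - min_lr_multi) * cosine
#
# # e.g. torch.optim.lr_scheduler.LambdaLR(optimizer, lambda s: lr_multiplier(s, T))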
evaluate = False
deep_fusion = False
evaluation = dict(
    eval_frame_ensemble="concat",  # [concat, max, mean, lse]
    eval_x_only=False,
    k_test=128,
    eval_offload=True,  # offload gpu tensors to cpu to save memory
)
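# eval_frame_ensemble chooses how per-frame results are fused at evaluation:
# "max", "mean", and "lse" (log-sum-exp, a soft maximum) combine scores, while
# "concat" fuses frame features before scoring. Commented-out sketch of the
# score-level variants (an illustrative assumption):
#
# import torch
#
# def ensemble_frame_scores(scores, mode="mean"):
#     """scores: (num_frames, num_candidates) per-frame matching scores."""
#     if mode == "max":
#         return scores.max(dim=0).values
#     if mode == "mean":
#         return scores.mean(dim=0)
#     if mode == "lse":
#         return torch.logsumexp(scores, dim=0)
#     raise ValueError(f"unsupported mode: {mode}")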
fp16 = True
gradient_checkpointing = True
# ========================= wandb ==========================
wandb = dict(
    enable=False,
    entity="user",  # username or team name that owns the runs, see https://docs.wandb.ai/ref/python/init
    project="videochat2",  # typically overridden from the command line
)
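# Note the name collision: this config variable shadows the wandb package, so
# a launcher would import the library under an alias. Commented-out sketch:
#
# import wandb as wandb_lib
#
# if wandb["enable"]:
#     wandb_lib.init(entity=wandb["entity"], project=wandb["project"])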
dist_url = "env://"
device = "cuda"
mode = "it"
# ========================= others ==========================
output_dir = None  # output dir
resume = False  # if True, load optimizer and scheduler states as well
debug = False
log_freq = 5
metric_window_size = 10  # window size for smoothing logged metrics
seed = 42
report_to='tensorboard'
save_latest = True
auto_resume = True
pretrained_path = ""  # path to pretrained model weights (seemingly used only when resuming)