Skip to content

Commit

Permalink
Add desc to map/filter (#1162)
Browse files (browse the repository at this point in the history)
* Add desc to map/filter

* update descriptions

---------

Co-authored-by: Wing Lian <[email protected]>
  • Loading branch information
casper-hansen and winglian authored Jan 23, 2024
1 parent cda52dc commit 6840381
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 2 deletions.
5 changes: 4 additions & 1 deletion src/axolotl/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,7 +410,10 @@ def ultra_apply_chatml(sample): # pylint: disable=possibly-unused-variable
for i, data_set in enumerate(train_datasets):
_type = cfg.datasets[i]["type"]
ds_type_fn = locals()[_type]
- train_datasets[i] = data_set.map(ds_type_fn)
+ train_datasets[i] = data_set.map(
+ ds_type_fn,
+ desc="Mapping RL Dataset",
+ )
train_dataset = concatenate_datasets(train_datasets)

# eval_dataset = eval_dataset.map(intel_apply_chatml)
Expand Down
1 change: 1 addition & 0 deletions src/axolotl/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def process(self, dataset):
num_proc=num_proc,
remove_columns=features,
keep_in_memory=self.keep_in_memory,
desc="Tokenizing Prompts",
**map_kwargs,
)

Expand Down
1 change: 1 addition & 0 deletions src/axolotl/utils/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -792,6 +792,7 @@ def load_pretraining_dataset(path, tokenizer, cfg, name=None, max_tokens=2048, s
# remove all the existing columns after mapping since they end up having
# a different length than the encoded/tokenized column
remove_columns=dataset.features.keys(),
desc="Encoding Pretraining",
)
return dataset

Expand Down
11 changes: 10 additions & 1 deletion src/axolotl/utils/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,33 +134,38 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
drop_long,
num_proc=cfg.dataset_processes,
load_from_cache_file=not cfg.is_preprocess,
desc="Dropping Long Sequences",
)
if eval_dataset:
eval_dataset = eval_dataset.filter(
drop_long,
num_proc=cfg.dataset_processes,
load_from_cache_file=not cfg.is_preprocess,
desc="Dropping Long Sequences",
)

if cfg.group_by_length:
train_dataset = train_dataset.map(
add_length,
num_proc=cfg.dataset_processes,
load_from_cache_file=not cfg.is_preprocess,
desc="Group By Length",
)

if cfg.sample_packing:
train_dataset = train_dataset.map(
add_position_ids,
num_proc=cfg.dataset_processes,
load_from_cache_file=not cfg.is_preprocess,
desc="Add position_id column (Sample Packing)",
)
if cfg.eval_sample_packing is not False:
if eval_dataset:
eval_dataset = eval_dataset.map(
add_position_ids,
num_proc=cfg.dataset_processes,
load_from_cache_file=not cfg.is_preprocess,
desc="Add position_id column (Sample Packing)",
)

return train_dataset, eval_dataset
Expand All @@ -169,9 +174,13 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
def process_pretraining_datasets_for_packing(train_dataset, sequence_len):
drop_long = partial(drop_long_seq, sequence_len=sequence_len)

- train_dataset = train_dataset.filter(drop_long)
+ train_dataset = train_dataset.filter(
+ drop_long,
+ desc="Dropping Long Sequences",
+ )
train_dataset = train_dataset.map(
add_position_ids,
desc="Add position_id column (Pretraining Sample Packing)",
)
return train_dataset

Expand Down

0 comments on commit 6840381

Please sign in to comment.