From f55bb8cfca0451df88c4964659944a43364399d1 Mon Sep 17 00:00:00 2001 From: Artem Chumachenko Date: Tue, 1 Oct 2024 14:42:35 +0200 Subject: [PATCH 01/21] Add format checks --- src/together/constants.py | 23 +++++++++++ src/together/utils/files.py | 81 ++++++++++++++++++++++++++++++------- 2 files changed, 90 insertions(+), 14 deletions(-) diff --git a/src/together/constants.py b/src/together/constants.py index b4c9cf3b..3e3399e5 100644 --- a/src/together/constants.py +++ b/src/together/constants.py @@ -1,3 +1,5 @@ +import enum + # Session constants TIMEOUT_SECS = 600 MAX_SESSION_LIFETIME_SECS = 180 @@ -29,3 +31,24 @@ # expected columns for Parquet files PARQUET_EXPECTED_COLUMNS = ["input_ids", "attention_mask", "labels"] + + +class DatasetFormat(enum.Enum): + """Dataset format enum. + + Args: + enum (enum.Enum): Enum class for dataset format. + """ + + GENERAL = "general" + CONVERSATION = "conversation" + INSTRUCTION = "instruction" + + +JSONL_REQUIRED_COLUMNS_MAP = { + DatasetFormat.GENERAL: ["text"], + DatasetFormat.CONVERSATION: ["messages"], + DatasetFormat.INSTRUCTION: ["prompt", "completion"], +} +REQUIRED_COLUMNS_CONVERSATION = ["role", "content"] +POSSIBLE_ROLES_CONVERSATION = ["system", "user", "assistant"] \ No newline at end of file diff --git a/src/together/utils/files.py b/src/together/utils/files.py index 875f4d84..ba342e4f 100644 --- a/src/together/utils/files.py +++ b/src/together/utils/files.py @@ -13,6 +13,10 @@ MIN_SAMPLES, NUM_BYTES_IN_GB, PARQUET_EXPECTED_COLUMNS, + JSONL_REQUIRED_COLUMNS_MAP, + REQUIRED_COLUMNS_CONVERSATION, + POSSIBLE_ROLES_CONVERSATION, + DatasetFormat ) @@ -88,6 +92,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: report_dict["is_check_passed"] = False return report_dict + dataset_format = None with file.open() as f: # idx must be instantiated so decode errors (e.g. file is a tar) or empty files are caught idx = -1 @@ -104,23 +109,68 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: report_dict["is_check_passed"] = False - if "text" not in json_line.keys(): - report_dict["text_field"] = False - report_dict["message"] = ( - f"Missing 'text' field was found on line {idx + 1} of the the input file. " - "Expected format: {'text': 'my sample string'}. " - ) - report_dict["is_check_passed"] = False - else: - # check to make sure the value of the "text" key is a string - if not isinstance(json_line["text"], str): - report_dict["key_value"] = False + if dataset_format is None: + for possible_format in JSONL_REQUIRED_COLUMNS_MAP: + if all(column in json_line for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format]): + if dataset_format is not None: + report_dict["message"] = ( + "All samples in the dataset must have the same dataset format. " + f"Got {dataset_format} for the first line and {possible_format} " + f"for the {idx + 1} line." + ) + raise KeyError + dataset_format = possible_format + if dataset_format is None: report_dict["message"] = ( - f'Invalid value type for "text" key on line {idx + 1}. ' - f'Expected string. Found {type(json_line["text"])}.' + "Error parsing file. Could not detect a possible format for the line with the columns:\n" + f"{json_line.keys()}" ) + raise KeyError + for column in JSONL_REQUIRED_COLUMNS_MAP[dataset_format]: + if column not in json_line.keys(): + report_dict["text_field"] = False + report_dict["message"] = ( + f"Missing '{column}' field was found on line {idx + 1} of the the input file." + ) report_dict["is_check_passed"] = False + raise KeyError + + if dataset_format == DatasetFormat.CONVERSATION: + message_column = JSONL_REQUIRED_COLUMNS_MAP[DatasetFormat.CONVERSATION] + if not isinstance(json_line[message_column], dict): + report_dict["key_value"] = False + report_dict["message"] = ( + f"Invalid format on line {idx + 1} of the input file. " + f"Expected dict. Found {type(json_line[message_column])}" + ) + raise KeyError + for column in REQUIRED_COLUMNS_CONVERSATION: + if column not in json_line[message_column].keys(): + report_dict["key_value"] = False + report_dict["message"] = ( + f"Missing '{column}' field was found on line {idx + 1} of the the input file." + ) + raise KeyError + else: + if isinstance(json_line[message_column][column], str): + report_dict["text_field"] = False + report_dict["message"] = ( + f"Invalid format on line {idx + 1} in the column {column} of the input file. " + f"Expected string. found {type(json_line[message_column][column])}" + ) + pass + else: + # check to make sure the value of the keys is a string + for column in JSONL_REQUIRED_COLUMNS_MAP[dataset_format]: + if not isinstance(json_line[column], str): + report_dict["key_value"] = False + report_dict["message"] = ( + f'Invalid value type for "{column}" key on line {idx + 1}. ' + f'Expected string. Found {type(json_line[column])}.' + ) + + report_dict["is_check_passed"] = False # make sure this is outside the for idx, line in enumerate(f): for loop if idx + 1 < MIN_SAMPLES: @@ -136,7 +186,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: report_dict["load_json"] = True - except ValueError: + except ValueError as _: report_dict["load_json"] = False if idx < 0: report_dict["message"] = ( @@ -148,6 +198,9 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: f"Error parsing json payload. Unexpected format on line {idx + 1}." ) report_dict["is_check_passed"] = False + except KeyError as _: + report_dict["load_json"] = False + report_dict["is_check_passed"] = False if "text_field" not in report_dict: report_dict["text_field"] = True From 8c5106cb8284ccfb18dad851f2f1739faa595e0b Mon Sep 17 00:00:00 2001 From: Artem Chumachenko Date: Thu, 3 Oct 2024 11:43:38 +0200 Subject: [PATCH 02/21] add tests --- src/together/utils/files.py | 87 ++++++++++------ tests/unit/test_files_checks.py | 172 ++++++++++++++++++++++++++++++++ 2 files changed, 231 insertions(+), 28 deletions(-) create mode 100644 tests/unit/test_files_checks.py diff --git a/src/together/utils/files.py b/src/together/utils/files.py index ba342e4f..5e7bdeae 100644 --- a/src/together/utils/files.py +++ b/src/together/utils/files.py @@ -109,21 +109,32 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: report_dict["is_check_passed"] = False + current_format = None + for possible_format in JSONL_REQUIRED_COLUMNS_MAP: + if all(column in json_line for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format]): + if current_format is None: + current_format = possible_format + elif current_format != possible_format: + report_dict["message"] = ( + "Found multiple dataset formats in the input file. " + f"Got {current_format} and {possible_format} on line {idx + 1}." + ) + raise KeyError + if current_format is None and dataset_format is None: + report_dict["message"] = ( + "Error parsing file. Could not detect a possible format for the line with the columns:\n" + f"{json_line.keys()}" + ) + raise KeyError + if dataset_format is None: - for possible_format in JSONL_REQUIRED_COLUMNS_MAP: - if all(column in json_line for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format]): - if dataset_format is not None: - report_dict["message"] = ( - "All samples in the dataset must have the same dataset format. " - f"Got {dataset_format} for the first line and {possible_format} " - f"for the {idx + 1} line." - ) - raise KeyError - dataset_format = possible_format - if dataset_format is None: + dataset_format = current_format + elif current_format is not None: + if current_format != dataset_format: report_dict["message"] = ( - "Error parsing file. Could not detect a possible format for the line with the columns:\n" - f"{json_line.keys()}" + "All samples in the dataset must have the same dataset format. " + f"Got {dataset_format} for the first line and {current_format} " + f"for the {idx + 1} line." ) raise KeyError @@ -137,29 +148,49 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: raise KeyError if dataset_format == DatasetFormat.CONVERSATION: - message_column = JSONL_REQUIRED_COLUMNS_MAP[DatasetFormat.CONVERSATION] - if not isinstance(json_line[message_column], dict): + message_column = JSONL_REQUIRED_COLUMNS_MAP[DatasetFormat.CONVERSATION][0] + if not isinstance(json_line[message_column], list): report_dict["key_value"] = False report_dict["message"] = ( f"Invalid format on line {idx + 1} of the input file. " - f"Expected dict. Found {type(json_line[message_column])}" + f"Expected list. Found {type(json_line[message_column])}" ) raise KeyError for column in REQUIRED_COLUMNS_CONVERSATION: - if column not in json_line[message_column].keys(): + for turn in json_line[message_column]: + if column not in turn.keys(): + report_dict["key_value"] = False + report_dict["message"] = ( + f"Missing '{column}' in a turn was found on line {idx + 1} of the the input file." + ) + raise KeyError + else: + if not isinstance(turn[column], str): + report_dict["text_field"] = False + report_dict["message"] = ( + f"Invalid format on line {idx + 1} in the column {column} of the input file. " + f"Expected string. found {type(turn[column])}" + ) + raise KeyError + + roles = set(turn["role"] for turn in json_line["messages"]) + for role in roles: + if role not in POSSIBLE_ROLES_CONVERSATION: report_dict["key_value"] = False report_dict["message"] = ( - f"Missing '{column}' field was found on line {idx + 1} of the the input file." + f"Found invalid role '{role}' in the messages on the line {idx + 1}. " + f"Possible roles in the conversation are: {POSSIBLE_ROLES_CONVERSATION}" ) raise KeyError - else: - if isinstance(json_line[message_column][column], str): - report_dict["text_field"] = False - report_dict["message"] = ( - f"Invalid format on line {idx + 1} in the column {column} of the input file. " - f"Expected string. found {type(json_line[message_column][column])}" - ) - pass + + is_user_turn = [turn["role"] == "user" for turn in json_line["messages"]] + if any(i == j for i, j in zip(is_user_turn[1:], is_user_turn[:1])): + report_dict["key_value"] = False + report_dict["message"] = ( + f"Invalid role turns on line {idx + 1} of the input file. " + "'user' and 'assistant' roles must alternate user/assistant/user/assistant/..." + ) + raise KeyError else: # check to make sure the value of the keys is a string for column in JSONL_REQUIRED_COLUMNS_MAP[dataset_format]: @@ -169,8 +200,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: f'Invalid value type for "{column}" key on line {idx + 1}. ' f'Expected string. Found {type(json_line[column])}.' ) - - report_dict["is_check_passed"] = False + raise KeyError # make sure this is outside the for idx, line in enumerate(f): for loop if idx + 1 < MIN_SAMPLES: @@ -183,6 +213,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: else: report_dict["num_samples"] = idx + 1 report_dict["min_samples"] = True + report_dict["is_check_passed"] = True report_dict["load_json"] = True diff --git a/tests/unit/test_files_checks.py b/tests/unit/test_files_checks.py new file mode 100644 index 00000000..a8b80131 --- /dev/null +++ b/tests/unit/test_files_checks.py @@ -0,0 +1,172 @@ +import pytest +from pathlib import Path +from together.utils.files import check_file + + +def test_check_jsonl_valid_general(tmp_path: Path): + # Create a valid JSONL file + file = tmp_path / "valid.jsonl" + content = [ + '{"text": "Hello, world!"}', + '{"text": "How are you?"}' + ] + file.write_text("\n".join(content), encoding="utf-8") + + report = check_file(file) + + assert report["is_check_passed"] + assert report["utf8"] + assert report["num_samples"] == 2 + assert report["min_samples"] + + +def test_check_jsonl_valid_instruction(tmp_path: Path): + # Create a valid JSONL file with instruction format + file = tmp_path / "valid_instruction.jsonl" + content = [ + '{"prompt": "Translate the following sentence.", "completion": "Hello, world!"}', + '{"prompt": "Summarize the text.", "completion": "OpenAI creates advanced AI."}' + ] + file.write_text("\n".join(content), encoding="utf-8") + + report = check_file(file) + + assert report["is_check_passed"] + assert report["utf8"] + assert report["num_samples"] == 2 + assert report["min_samples"] + + +def test_check_jsonl_valid_conversational_single_turn(tmp_path: Path): + # Create a valid JSONL file with conversational format and 1 user-assistant turn pair + file = tmp_path / "valid_conversational_single_turn.jsonl" + content = [ + '{"messages": [{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi there!"}]}', + '{"messages": [{"role": "user", "content": "How are you?"}, {"role": "assistant", "content": "I am fine."}]}' + ] + file.write_text("\n".join(content), encoding="utf-8") + + report = check_file(file) + + print(report) + + assert report["is_check_passed"] + assert report["utf8"] + assert report["num_samples"] == 2 + assert report["min_samples"] + + +def test_check_jsonl_valid_conversational_multiple_turns(tmp_path: Path): + # Create a valid JSONL file with conversational format and multiple user-assistant turn pairs + file = tmp_path / "valid_conversational_multiple_turns.jsonl" + content = [ + '{"messages": [{"role": "user", "content": "Is it going to rain today?"}, {"role": "assistant", "content": "Yes, expect showers in the afternoon."}]}', + '{"messages": [{"role": "user", "content": "Who won the game last night?"}, {"role": "assistant", "content": "The home team won by two points."}]}' + ] + file.write_text("\n".join(content), encoding="utf-8") + + report = check_file(file) + + assert report["is_check_passed"] + assert report["utf8"] + assert report["num_samples"] == 2 + assert report["min_samples"] + + +def test_check_jsonl_empty_file(tmp_path: Path): + # Create an empty JSONL file + file = tmp_path / "empty.jsonl" + file.touch() + + report = check_file(file) + + print(report) + + assert not report["is_check_passed"] + assert report["message"] == "File is empty" + assert report["file_size"] == 0 + + +def test_check_jsonl_non_utf8(tmp_path: Path): + # Create a non-UTF-8 encoded JSONL file + file = tmp_path / "non_utf8.jsonl" + file.write_bytes(b'\xff\xfe\xfd') + + report = check_file(file) + + assert not report["is_check_passed"] + assert not report["utf8"] + assert "File is not UTF-8 encoded." in report["message"] + + +def test_check_jsonl_invalid_json(tmp_path: Path): + # Create a JSONL file with invalid JSON + file = tmp_path / "invalid_json.jsonl" + content = [ + '{"text": "Hello, world!"}', + 'Invalid JSON Line' + ] + file.write_text("\n".join(content), encoding="utf-8") + + report = check_file(file) + + assert not report["is_check_passed"] + assert "Error parsing json payload" in report["message"] + + +def test_check_jsonl_missing_required_field(tmp_path: Path): + # Create a JSONL file missing a required field + file = tmp_path / "missing_field.jsonl" + content = [ + '{"prompt": "Translate the following sentence.", "completion": "Hello, world!"}', + '{"prompt": "Summarize the text."}' + ] + file.write_text("\n".join(content), encoding="utf-8") + + report = check_file(file) + + assert not report["is_check_passed"] + assert "Missing 'completion' field was found on line 2" in report["message"] + + +def test_check_jsonl_inconsistent_dataset_format(tmp_path: Path): + # Create a JSONL file with inconsistent dataset formats + file = tmp_path / "inconsistent_format.jsonl" + content = [ + '{"messages": [{"role": "user", "content": "Hi"}]}', + '{"text": "How are you?"}' # Missing 'messages' + ] + file.write_text("\n".join(content), encoding="utf-8") + + report = check_file(file) + + assert not report["is_check_passed"] + assert "All samples in the dataset must have the same dataset format" in report["message"] + + +def test_check_jsonl_invalid_role(tmp_path: Path): + # Create a JSONL file with an invalid role + file = tmp_path / "invalid_role.jsonl" + content = [ + '{"messages": [{"role": "invalid_role", "content": "Hi"}]}' + ] + file.write_text("\n".join(content), encoding="utf-8") + + report = check_file(file) + + assert not report["is_check_passed"] + assert "Found invalid role 'invalid_role'" in report["message"] + + +def test_check_jsonl_non_alternating_roles(tmp_path: Path): + # Create a JSONL file with non-alternating user/assistant roles + file = tmp_path / "non_alternating_roles.jsonl" + content = [ + '{"messages": [{"role": "user", "content": "Hi"}, {"role": "user", "content": "Hello again"}]}' + ] + file.write_text("\n".join(content), encoding="utf-8") + + report = check_file(file) + + assert not report["is_check_passed"] + assert "Invalid role turns" in report["message"] From d90b4a545f94a5c592a51b178ed07126c3e13346 Mon Sep 17 00:00:00 2001 From: Artem Chumachenko Date: Mon, 4 Nov 2024 16:32:23 +0100 Subject: [PATCH 03/21] add train on inputs flag --- src/together/cli/api/finetune.py | 22 +++++----------------- src/together/cli/api/utils.py | 21 +++++++++++++++++++++ src/together/resources/finetune.py | 6 ++++++ src/together/types/finetune.py | 2 ++ 4 files changed, 34 insertions(+), 17 deletions(-) diff --git a/src/together/cli/api/finetune.py b/src/together/cli/api/finetune.py index 3fdbd74b..334eb290 100644 --- a/src/together/cli/api/finetune.py +++ b/src/together/cli/api/finetune.py @@ -11,7 +11,7 @@ from tabulate import tabulate from together import Together -from together.cli.api.utils import INT_WITH_MAX +from together.cli.api.utils import BOOL_WITH_AUTO, INT_WITH_MAX from together.utils import finetune_price_to_dollars, log_warn, parse_timestamp from together.types.finetune import DownloadCheckpointType, FinetuneTrainingLimits @@ -93,6 +93,7 @@ def fine_tuning(ctx: click.Context) -> None: default=False, help="Whether to skip the launch confirmation message", ) +@click.option("--train-on-inputs", type=BOOL_WITH_AUTO, default=True, help="Whether to mask the inputs in conversational data") def create( ctx: click.Context, training_file: str, @@ -112,6 +113,7 @@ def create( suffix: str, wandb_api_key: str, confirm: bool, + train_on_inputs: bool | Literal["auto"], ) -> None: """Start fine-tuning""" client: Together = ctx.obj @@ -133,6 +135,7 @@ def create( lora_trainable_modules=lora_trainable_modules, suffix=suffix, wandb_api_key=wandb_api_key, + train_on_inputs=train_on_inputs, ) model_limits: FinetuneTrainingLimits = client.fine_tuning.get_model_limits( @@ -186,22 +189,7 @@ def create( if confirm or click.confirm(_CONFIRMATION_MESSAGE, default=True, show_default=True): response = client.fine_tuning.create( - training_file=training_file, - model=model, - n_epochs=n_epochs, - validation_file=validation_file, - n_evals=n_evals, - n_checkpoints=n_checkpoints, - batch_size=batch_size, - learning_rate=learning_rate, - warmup_ratio=warmup_ratio, - lora=lora, - lora_r=lora_r, - lora_dropout=lora_dropout, - lora_alpha=lora_alpha, - lora_trainable_modules=lora_trainable_modules, - suffix=suffix, - wandb_api_key=wandb_api_key, + **training_args, verbose=True, ) diff --git a/src/together/cli/api/utils.py b/src/together/cli/api/utils.py index 3f85f380..e9a7308f 100644 --- a/src/together/cli/api/utils.py +++ b/src/together/cli/api/utils.py @@ -27,4 +27,25 @@ def convert( ) +class BooleanWithAutoParamType(click.ParamType): + name = "boolean_or_auto" + + def convert( + self, value: str, param: click.Parameter | None, ctx: click.Context | None + ) -> bool | Literal["auto"] | None: + if value == "auto": + return "auto" + try: + return bool(value) + except ValueError: + self.fail( + _("{value!r} is not a valid {boolean_type}.").format( + value=value, boolean_type=self.name + ), + param, + ctx, + ) + + INT_WITH_MAX = AutoIntParamType() +BOOL_WITH_AUTO = BooleanWithAutoParamType() \ No newline at end of file diff --git a/src/together/resources/finetune.py b/src/together/resources/finetune.py index 44d74f2b..e6d3c350 100644 --- a/src/together/resources/finetune.py +++ b/src/together/resources/finetune.py @@ -125,6 +125,7 @@ def create( wandb_api_key: str | None = None, verbose: bool = False, model_limits: FinetuneTrainingLimits | None = None, + train_on_inputs: bool | Literal["auto"] = "auto", ) -> FinetuneResponse: """ Method to initiate a fine-tuning job @@ -154,6 +155,7 @@ def create( Defaults to False. model_limits (FinetuneTrainingLimits, optional): Limits for the hyperparameters the model in Fine-tuning. Defaults to None. + train_on_inputs (bool, optional): Whether to mask the inputs in conversational data. Defaults to "auto". Returns: FinetuneResponse: Object containing information about fine-tuning job. @@ -184,6 +186,7 @@ def create( lora_trainable_modules=lora_trainable_modules, suffix=suffix, wandb_api_key=wandb_api_key, + train_on_inputs=train_on_inputs, ) if verbose: @@ -436,6 +439,7 @@ async def create( wandb_api_key: str | None = None, verbose: bool = False, model_limits: FinetuneTrainingLimits | None = None, + train_on_inputs: bool | Literal["auto"] = "auto" ) -> FinetuneResponse: """ Async method to initiate a fine-tuning job @@ -465,6 +469,7 @@ async def create( Defaults to False. model_limits (FinetuneTrainingLimits, optional): Limits for the hyperparameters the model in Fine-tuning. Defaults to None. + train_on_inputs (bool, optional): Whether to mask the inputs in conversational data. Defaults to "auto". Returns: FinetuneResponse: Object containing information about fine-tuning job. @@ -495,6 +500,7 @@ async def create( lora_trainable_modules=lora_trainable_modules, suffix=suffix, wandb_api_key=wandb_api_key, + train_on_inputs=train_on_inputs, ) if verbose: diff --git a/src/together/types/finetune.py b/src/together/types/finetune.py index 2f76c446..917a5143 100644 --- a/src/together/types/finetune.py +++ b/src/together/types/finetune.py @@ -163,6 +163,7 @@ class FinetuneRequest(BaseModel): # weights & biases api key wandb_key: str | None = None training_type: FullTrainingType | LoRATrainingType | None = None + train_on_inputs: bool | Literal["auto"] = "auto" class FinetuneResponse(BaseModel): @@ -230,6 +231,7 @@ class FinetuneResponse(BaseModel): # training file metadata training_file_num_lines: int | None = Field(None, alias="TrainingFileNumLines") training_file_size: int | None = Field(None, alias="TrainingFileSize") + train_on_inputs: bool | Literal["auto"] = "auto" @field_validator("training_type") @classmethod From 8be70ac066c796ff5bc76468ec21e7754d5eb367 Mon Sep 17 00:00:00 2001 From: Artem Chumachenko Date: Mon, 4 Nov 2024 16:49:52 +0100 Subject: [PATCH 04/21] style --- src/together/cli/api/finetune.py | 7 ++- src/together/cli/api/utils.py | 4 +- src/together/constants.py | 2 +- src/together/resources/finetune.py | 4 +- src/together/utils/files.py | 17 +++++-- tests/unit/test_files_checks.py | 79 ++++++++++++++---------------- 6 files changed, 61 insertions(+), 52 deletions(-) diff --git a/src/together/cli/api/finetune.py b/src/together/cli/api/finetune.py index 334eb290..fec29a8a 100644 --- a/src/together/cli/api/finetune.py +++ b/src/together/cli/api/finetune.py @@ -93,7 +93,12 @@ def fine_tuning(ctx: click.Context) -> None: default=False, help="Whether to skip the launch confirmation message", ) -@click.option("--train-on-inputs", type=BOOL_WITH_AUTO, default=True, help="Whether to mask the inputs in conversational data") +@click.option( + "--train-on-inputs", + type=BOOL_WITH_AUTO, + default=True, + help="Whether to mask the inputs in conversational data", +) def create( ctx: click.Context, training_file: str, diff --git a/src/together/cli/api/utils.py b/src/together/cli/api/utils.py index e9a7308f..7f80a68b 100644 --- a/src/together/cli/api/utils.py +++ b/src/together/cli/api/utils.py @@ -31,7 +31,7 @@ class BooleanWithAutoParamType(click.ParamType): name = "boolean_or_auto" def convert( - self, value: str, param: click.Parameter | None, ctx: click.Context | None + self, value: str, param: click.Parameter | None, ctx: click.Context | None ) -> bool | Literal["auto"] | None: if value == "auto": return "auto" @@ -48,4 +48,4 @@ def convert( INT_WITH_MAX = AutoIntParamType() -BOOL_WITH_AUTO = BooleanWithAutoParamType() \ No newline at end of file +BOOL_WITH_AUTO = BooleanWithAutoParamType() diff --git a/src/together/constants.py b/src/together/constants.py index 3e3399e5..3d34083d 100644 --- a/src/together/constants.py +++ b/src/together/constants.py @@ -51,4 +51,4 @@ class DatasetFormat(enum.Enum): DatasetFormat.INSTRUCTION: ["prompt", "completion"], } REQUIRED_COLUMNS_CONVERSATION = ["role", "content"] -POSSIBLE_ROLES_CONVERSATION = ["system", "user", "assistant"] \ No newline at end of file +POSSIBLE_ROLES_CONVERSATION = ["system", "user", "assistant"] diff --git a/src/together/resources/finetune.py b/src/together/resources/finetune.py index e6d3c350..40de2a1d 100644 --- a/src/together/resources/finetune.py +++ b/src/together/resources/finetune.py @@ -43,6 +43,7 @@ def createFinetuneRequest( lora_trainable_modules: str | None = "all-linear", suffix: str | None = None, wandb_api_key: str | None = None, + train_on_inputs: bool | Literal["auto"] = "auto", ) -> FinetuneRequest: if batch_size == "max": log_warn_once( @@ -95,6 +96,7 @@ def createFinetuneRequest( training_type=training_type, suffix=suffix, wandb_key=wandb_api_key, + train_on_inputs=train_on_inputs, ) return finetune_request @@ -439,7 +441,7 @@ async def create( wandb_api_key: str | None = None, verbose: bool = False, model_limits: FinetuneTrainingLimits | None = None, - train_on_inputs: bool | Literal["auto"] = "auto" + train_on_inputs: bool | Literal["auto"] = "auto", ) -> FinetuneResponse: """ Async method to initiate a fine-tuning job diff --git a/src/together/utils/files.py b/src/together/utils/files.py index 5e7bdeae..fd0c99c4 100644 --- a/src/together/utils/files.py +++ b/src/together/utils/files.py @@ -16,7 +16,7 @@ JSONL_REQUIRED_COLUMNS_MAP, REQUIRED_COLUMNS_CONVERSATION, POSSIBLE_ROLES_CONVERSATION, - DatasetFormat + DatasetFormat, ) @@ -111,7 +111,10 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: current_format = None for possible_format in JSONL_REQUIRED_COLUMNS_MAP: - if all(column in json_line for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format]): + if all( + column in json_line + for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format] + ): if current_format is None: current_format = possible_format elif current_format != possible_format: @@ -148,7 +151,9 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: raise KeyError if dataset_format == DatasetFormat.CONVERSATION: - message_column = JSONL_REQUIRED_COLUMNS_MAP[DatasetFormat.CONVERSATION][0] + message_column = JSONL_REQUIRED_COLUMNS_MAP[ + DatasetFormat.CONVERSATION + ][0] if not isinstance(json_line[message_column], list): report_dict["key_value"] = False report_dict["message"] = ( @@ -183,7 +188,9 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: ) raise KeyError - is_user_turn = [turn["role"] == "user" for turn in json_line["messages"]] + is_user_turn = [ + turn["role"] == "user" for turn in json_line["messages"] + ] if any(i == j for i, j in zip(is_user_turn[1:], is_user_turn[:1])): report_dict["key_value"] = False report_dict["message"] = ( @@ -198,7 +205,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: report_dict["key_value"] = False report_dict["message"] = ( f'Invalid value type for "{column}" key on line {idx + 1}. ' - f'Expected string. Found {type(json_line[column])}.' + f"Expected string. Found {type(json_line[column])}." ) raise KeyError diff --git a/tests/unit/test_files_checks.py b/tests/unit/test_files_checks.py index a8b80131..2662c6b1 100644 --- a/tests/unit/test_files_checks.py +++ b/tests/unit/test_files_checks.py @@ -6,14 +6,11 @@ def test_check_jsonl_valid_general(tmp_path: Path): # Create a valid JSONL file file = tmp_path / "valid.jsonl" - content = [ - '{"text": "Hello, world!"}', - '{"text": "How are you?"}' - ] + content = ['{"text": "Hello, world!"}', '{"text": "How are you?"}'] file.write_text("\n".join(content), encoding="utf-8") - + report = check_file(file) - + assert report["is_check_passed"] assert report["utf8"] assert report["num_samples"] == 2 @@ -25,12 +22,12 @@ def test_check_jsonl_valid_instruction(tmp_path: Path): file = tmp_path / "valid_instruction.jsonl" content = [ '{"prompt": "Translate the following sentence.", "completion": "Hello, world!"}', - '{"prompt": "Summarize the text.", "completion": "OpenAI creates advanced AI."}' + '{"prompt": "Summarize the text.", "completion": "OpenAI creates advanced AI."}', ] file.write_text("\n".join(content), encoding="utf-8") - + report = check_file(file) - + assert report["is_check_passed"] assert report["utf8"] assert report["num_samples"] == 2 @@ -42,14 +39,14 @@ def test_check_jsonl_valid_conversational_single_turn(tmp_path: Path): file = tmp_path / "valid_conversational_single_turn.jsonl" content = [ '{"messages": [{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi there!"}]}', - '{"messages": [{"role": "user", "content": "How are you?"}, {"role": "assistant", "content": "I am fine."}]}' + '{"messages": [{"role": "user", "content": "How are you?"}, {"role": "assistant", "content": "I am fine."}]}', ] file.write_text("\n".join(content), encoding="utf-8") - + report = check_file(file) - + print(report) - + assert report["is_check_passed"] assert report["utf8"] assert report["num_samples"] == 2 @@ -61,12 +58,12 @@ def test_check_jsonl_valid_conversational_multiple_turns(tmp_path: Path): file = tmp_path / "valid_conversational_multiple_turns.jsonl" content = [ '{"messages": [{"role": "user", "content": "Is it going to rain today?"}, {"role": "assistant", "content": "Yes, expect showers in the afternoon."}]}', - '{"messages": [{"role": "user", "content": "Who won the game last night?"}, {"role": "assistant", "content": "The home team won by two points."}]}' + '{"messages": [{"role": "user", "content": "Who won the game last night?"}, {"role": "assistant", "content": "The home team won by two points."}]}', ] file.write_text("\n".join(content), encoding="utf-8") - + report = check_file(file) - + assert report["is_check_passed"] assert report["utf8"] assert report["num_samples"] == 2 @@ -77,11 +74,11 @@ def test_check_jsonl_empty_file(tmp_path: Path): # Create an empty JSONL file file = tmp_path / "empty.jsonl" file.touch() - + report = check_file(file) - + print(report) - + assert not report["is_check_passed"] assert report["message"] == "File is empty" assert report["file_size"] == 0 @@ -90,10 +87,10 @@ def test_check_jsonl_empty_file(tmp_path: Path): def test_check_jsonl_non_utf8(tmp_path: Path): # Create a non-UTF-8 encoded JSONL file file = tmp_path / "non_utf8.jsonl" - file.write_bytes(b'\xff\xfe\xfd') - + file.write_bytes(b"\xff\xfe\xfd") + report = check_file(file) - + assert not report["is_check_passed"] assert not report["utf8"] assert "File is not UTF-8 encoded." in report["message"] @@ -102,14 +99,11 @@ def test_check_jsonl_non_utf8(tmp_path: Path): def test_check_jsonl_invalid_json(tmp_path: Path): # Create a JSONL file with invalid JSON file = tmp_path / "invalid_json.jsonl" - content = [ - '{"text": "Hello, world!"}', - 'Invalid JSON Line' - ] + content = ['{"text": "Hello, world!"}', "Invalid JSON Line"] file.write_text("\n".join(content), encoding="utf-8") - + report = check_file(file) - + assert not report["is_check_passed"] assert "Error parsing json payload" in report["message"] @@ -119,12 +113,12 @@ def test_check_jsonl_missing_required_field(tmp_path: Path): file = tmp_path / "missing_field.jsonl" content = [ '{"prompt": "Translate the following sentence.", "completion": "Hello, world!"}', - '{"prompt": "Summarize the text."}' + '{"prompt": "Summarize the text."}', ] file.write_text("\n".join(content), encoding="utf-8") - + report = check_file(file) - + assert not report["is_check_passed"] assert "Missing 'completion' field was found on line 2" in report["message"] @@ -134,26 +128,27 @@ def test_check_jsonl_inconsistent_dataset_format(tmp_path: Path): file = tmp_path / "inconsistent_format.jsonl" content = [ '{"messages": [{"role": "user", "content": "Hi"}]}', - '{"text": "How are you?"}' # Missing 'messages' + '{"text": "How are you?"}', # Missing 'messages' ] file.write_text("\n".join(content), encoding="utf-8") - + report = check_file(file) - + assert not report["is_check_passed"] - assert "All samples in the dataset must have the same dataset format" in report["message"] + assert ( + "All samples in the dataset must have the same dataset format" + in report["message"] + ) def test_check_jsonl_invalid_role(tmp_path: Path): # Create a JSONL file with an invalid role file = tmp_path / "invalid_role.jsonl" - content = [ - '{"messages": [{"role": "invalid_role", "content": "Hi"}]}' - ] + content = ['{"messages": [{"role": "invalid_role", "content": "Hi"}]}'] file.write_text("\n".join(content), encoding="utf-8") - + report = check_file(file) - + assert not report["is_check_passed"] assert "Found invalid role 'invalid_role'" in report["message"] @@ -165,8 +160,8 @@ def test_check_jsonl_non_alternating_roles(tmp_path: Path): '{"messages": [{"role": "user", "content": "Hi"}, {"role": "user", "content": "Hello again"}]}' ] file.write_text("\n".join(content), encoding="utf-8") - + report = check_file(file) - + assert not report["is_check_passed"] assert "Invalid role turns" in report["message"] From a5d666a14cf8a2d08a88ef12b1eb62fe5c8db4e4 Mon Sep 17 00:00:00 2001 From: Artem Chumachenko Date: Thu, 7 Nov 2024 14:00:03 +0100 Subject: [PATCH 05/21] PR feedback --- src/together/cli/api/finetune.py | 4 +- src/together/cli/api/utils.py | 4 +- src/together/constants.py | 8 +-- src/together/utils/files.py | 107 ++++++++++++++++--------------- tests/unit/test_files_checks.py | 80 +++++++++++++---------- 5 files changed, 106 insertions(+), 97 deletions(-) diff --git a/src/together/cli/api/finetune.py b/src/together/cli/api/finetune.py index fec29a8a..4d4896b2 100644 --- a/src/together/cli/api/finetune.py +++ b/src/together/cli/api/finetune.py @@ -96,8 +96,8 @@ def fine_tuning(ctx: click.Context) -> None: @click.option( "--train-on-inputs", type=BOOL_WITH_AUTO, - default=True, - help="Whether to mask the inputs in conversational data", + default="auto", + help="Whether to mask the user messages in conversational data or prompts in instruction data", ) def create( ctx: click.Context, diff --git a/src/together/cli/api/utils.py b/src/together/cli/api/utils.py index 7f80a68b..08dfe492 100644 --- a/src/together/cli/api/utils.py +++ b/src/together/cli/api/utils.py @@ -39,8 +39,8 @@ def convert( return bool(value) except ValueError: self.fail( - _("{value!r} is not a valid {boolean_type}.").format( - value=value, boolean_type=self.name + _("{value!r} is not a valid {type}.").format( + value=value, type=self.name ), param, ctx, diff --git a/src/together/constants.py b/src/together/constants.py index 3d34083d..c64af326 100644 --- a/src/together/constants.py +++ b/src/together/constants.py @@ -34,11 +34,7 @@ class DatasetFormat(enum.Enum): - """Dataset format enum. - - Args: - enum (enum.Enum): Enum class for dataset format. - """ + """Dataset format enum.""" GENERAL = "general" CONVERSATION = "conversation" @@ -50,5 +46,5 @@ class DatasetFormat(enum.Enum): DatasetFormat.CONVERSATION: ["messages"], DatasetFormat.INSTRUCTION: ["prompt", "completion"], } -REQUIRED_COLUMNS_CONVERSATION = ["role", "content"] +REQUIRED_COLUMNS_MESSAGE = ["role", "content"] POSSIBLE_ROLES_CONVERSATION = ["system", "user", "assistant"] diff --git a/src/together/utils/files.py b/src/together/utils/files.py index fd0c99c4..a3537ba8 100644 --- a/src/together/utils/files.py +++ b/src/together/utils/files.py @@ -14,12 +14,19 @@ NUM_BYTES_IN_GB, PARQUET_EXPECTED_COLUMNS, JSONL_REQUIRED_COLUMNS_MAP, - REQUIRED_COLUMNS_CONVERSATION, + REQUIRED_COLUMNS_MESSAGE, POSSIBLE_ROLES_CONVERSATION, DatasetFormat, ) +class InvalidFileFormatError(Exception): + """Exception raised for invalid file formats during file checks.""" + + def __init__(self) -> None: + super().__init__() + + def check_file( file: Path | str, ) -> Dict[str, Any]: @@ -108,9 +115,11 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: ) report_dict["is_check_passed"] = False + raise InvalidFileFormatError current_format = None for possible_format in JSONL_REQUIRED_COLUMNS_MAP: + # Check if every column in the data contains in if all( column in json_line for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format] @@ -122,35 +131,16 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: "Found multiple dataset formats in the input file. " f"Got {current_format} and {possible_format} on line {idx + 1}." ) - raise KeyError - if current_format is None and dataset_format is None: + raise InvalidFileFormatError + + if current_format is None: report_dict["message"] = ( - "Error parsing file. Could not detect a possible format for the line with the columns:\n" + f"Error parsing file. Could not detect a format for the line {idx + 1} with the columns:\n" f"{json_line.keys()}" ) - raise KeyError + raise InvalidFileFormatError - if dataset_format is None: - dataset_format = current_format - elif current_format is not None: - if current_format != dataset_format: - report_dict["message"] = ( - "All samples in the dataset must have the same dataset format. " - f"Got {dataset_format} for the first line and {current_format} " - f"for the {idx + 1} line." - ) - raise KeyError - - for column in JSONL_REQUIRED_COLUMNS_MAP[dataset_format]: - if column not in json_line.keys(): - report_dict["text_field"] = False - report_dict["message"] = ( - f"Missing '{column}' field was found on line {idx + 1} of the the input file." - ) - report_dict["is_check_passed"] = False - raise KeyError - - if dataset_format == DatasetFormat.CONVERSATION: + if current_format == DatasetFormat.CONVERSATION: message_column = JSONL_REQUIRED_COLUMNS_MAP[ DatasetFormat.CONVERSATION ][0] @@ -158,56 +148,69 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: report_dict["key_value"] = False report_dict["message"] = ( f"Invalid format on line {idx + 1} of the input file. " - f"Expected list. Found {type(json_line[message_column])}" + f"Expected a list of messages. Found {type(json_line[message_column])}" ) - raise KeyError - for column in REQUIRED_COLUMNS_CONVERSATION: - for turn in json_line[message_column]: - if column not in turn.keys(): + raise InvalidFileFormatError + + previous_role = "" + for turn in json_line[message_column]: + for column in REQUIRED_COLUMNS_MESSAGE: + if column not in turn: report_dict["key_value"] = False report_dict["message"] = ( - f"Missing '{column}' in a turn was found on line {idx + 1} of the the input file." + f"Field '{column}' is missing for a turn `{turn}` on line {idx + 1} " + "of the the input file." ) - raise KeyError + raise InvalidFileFormatError else: if not isinstance(turn[column], str): report_dict["text_field"] = False report_dict["message"] = ( - f"Invalid format on line {idx + 1} in the column {column} of the input file. " - f"Expected string. found {type(turn[column])}" + f"Invalid format on line {idx + 1} in the column {column} for turn `{turn}` " + f"of the input file. Expected string. Found {type(turn[column])}" ) - raise KeyError + raise InvalidFileFormatError + role = turn["role"] - roles = set(turn["role"] for turn in json_line["messages"]) - for role in roles: if role not in POSSIBLE_ROLES_CONVERSATION: report_dict["key_value"] = False report_dict["message"] = ( f"Found invalid role '{role}' in the messages on the line {idx + 1}. " f"Possible roles in the conversation are: {POSSIBLE_ROLES_CONVERSATION}" ) - raise KeyError + raise InvalidFileFormatError + + if previous_role == role: + report_dict["key_value"] = False + report_dict["message"] = ( + f"Invalid role turns on line {idx + 1} of the input file. " + "'user' and 'assistant' roles must alternate user/assistant/user/assistant/..." + ) + raise InvalidFileFormatError + + previous_role = role - is_user_turn = [ - turn["role"] == "user" for turn in json_line["messages"] - ] - if any(i == j for i, j in zip(is_user_turn[1:], is_user_turn[:1])): - report_dict["key_value"] = False - report_dict["message"] = ( - f"Invalid role turns on line {idx + 1} of the input file. " - "'user' and 'assistant' roles must alternate user/assistant/user/assistant/..." - ) - raise KeyError else: # check to make sure the value of the keys is a string - for column in JSONL_REQUIRED_COLUMNS_MAP[dataset_format]: + for column in JSONL_REQUIRED_COLUMNS_MAP[current_format]: if not isinstance(json_line[column], str): report_dict["key_value"] = False report_dict["message"] = ( f'Invalid value type for "{column}" key on line {idx + 1}. ' f"Expected string. Found {type(json_line[column])}." ) - raise KeyError + raise InvalidFileFormatError + + if dataset_format is None: + dataset_format = current_format + elif current_format is not None: + if current_format != dataset_format: + report_dict["message"] = ( + "All samples in the dataset must have the same dataset format. " + f"Got {dataset_format} for the first line and {current_format} " + f"for the line {idx + 1}." + ) + raise InvalidFileFormatError # make sure this is outside the for idx, line in enumerate(f): for loop if idx + 1 < MIN_SAMPLES: @@ -236,7 +239,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: f"Error parsing json payload. Unexpected format on line {idx + 1}." ) report_dict["is_check_passed"] = False - except KeyError as _: + except InvalidFileFormatError: report_dict["load_json"] = False report_dict["is_check_passed"] = False diff --git a/tests/unit/test_files_checks.py b/tests/unit/test_files_checks.py index 2662c6b1..f83533bc 100644 --- a/tests/unit/test_files_checks.py +++ b/tests/unit/test_files_checks.py @@ -1,73 +1,79 @@ +import json import pytest from pathlib import Path + +from together.constants import MIN_SAMPLES from together.utils.files import check_file def test_check_jsonl_valid_general(tmp_path: Path): # Create a valid JSONL file file = tmp_path / "valid.jsonl" - content = ['{"text": "Hello, world!"}', '{"text": "How are you?"}'] - file.write_text("\n".join(content), encoding="utf-8") + content = [{"text": "Hello, world!"}, {"text": "How are you?"}] + with file.open("w") as f: + f.write("\n".join([json.dumps(item) for item in content])) report = check_file(file) assert report["is_check_passed"] assert report["utf8"] - assert report["num_samples"] == 2 - assert report["min_samples"] + assert report["num_samples"] == len(content) + assert report["min_samples"] >= MIN_SAMPLES def test_check_jsonl_valid_instruction(tmp_path: Path): # Create a valid JSONL file with instruction format file = tmp_path / "valid_instruction.jsonl" content = [ - '{"prompt": "Translate the following sentence.", "completion": "Hello, world!"}', - '{"prompt": "Summarize the text.", "completion": "OpenAI creates advanced AI."}', + {"prompt": "Translate the following sentence.", "completion": "Hello, world!"}, + {"prompt": "Summarize the text.", "completion": "OpenAI creates advanced AI."}, ] - file.write_text("\n".join(content), encoding="utf-8") + with file.open("w") as f: + f.write("\n".join([json.dumps(item) for item in content])) report = check_file(file) assert report["is_check_passed"] assert report["utf8"] - assert report["num_samples"] == 2 - assert report["min_samples"] + assert report["num_samples"] == len(content) + assert report["min_samples"] >= MIN_SAMPLES def test_check_jsonl_valid_conversational_single_turn(tmp_path: Path): # Create a valid JSONL file with conversational format and 1 user-assistant turn pair file = tmp_path / "valid_conversational_single_turn.jsonl" content = [ - '{"messages": [{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi there!"}]}', - '{"messages": [{"role": "user", "content": "How are you?"}, {"role": "assistant", "content": "I am fine."}]}', + {"messages": [{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi there!"}]}, + {"messages": [{"role": "user", "content": "How are you?"}, {"role": "assistant", "content": "I am fine."}]}, ] - file.write_text("\n".join(content), encoding="utf-8") + with file.open("w") as f: + f.write("\n".join([json.dumps(item) for item in content])) report = check_file(file) - print(report) assert report["is_check_passed"] assert report["utf8"] - assert report["num_samples"] == 2 - assert report["min_samples"] + assert report["num_samples"] == len(content) + assert report["min_samples"] >= MIN_SAMPLES def test_check_jsonl_valid_conversational_multiple_turns(tmp_path: Path): # Create a valid JSONL file with conversational format and multiple user-assistant turn pairs file = tmp_path / "valid_conversational_multiple_turns.jsonl" content = [ - '{"messages": [{"role": "user", "content": "Is it going to rain today?"}, {"role": "assistant", "content": "Yes, expect showers in the afternoon."}]}', - '{"messages": [{"role": "user", "content": "Who won the game last night?"}, {"role": "assistant", "content": "The home team won by two points."}]}', + {"messages": [{"role": "user", "content": "Is it going to rain today?"}, {"role": "assistant", "content": "Yes, expect showers in the afternoon."}, {"role": "user", "content": "What is the weather like in Tokyo?"}, {"role": "assistant", "content": "It is sunny with a chance of rain."}]}, + {"messages": [{"role": "user", "content": "Who won the game last night?"}, {"role": "assistant", "content": "The home team won by two points."}, {"role": "user", "content": "What is the weather like in Amsterdam?"}, {"role": "assistant", "content": "It is cloudy with a chance of snow."}]}, ] - file.write_text("\n".join(content), encoding="utf-8") + with file.open("w") as f: + f.write("\n".join([json.dumps(item) for item in content])) report = check_file(file) assert report["is_check_passed"] assert report["utf8"] - assert report["num_samples"] == 2 - assert report["min_samples"] + assert report["num_samples"] == len(content) + assert report["min_samples"] >= MIN_SAMPLES def test_check_jsonl_empty_file(tmp_path: Path): @@ -77,7 +83,6 @@ def test_check_jsonl_empty_file(tmp_path: Path): report = check_file(file) - print(report) assert not report["is_check_passed"] assert report["message"] == "File is empty" @@ -99,38 +104,41 @@ def test_check_jsonl_non_utf8(tmp_path: Path): def test_check_jsonl_invalid_json(tmp_path: Path): # Create a JSONL file with invalid JSON file = tmp_path / "invalid_json.jsonl" - content = ['{"text": "Hello, world!"}', "Invalid JSON Line"] - file.write_text("\n".join(content), encoding="utf-8") + content = [{"text": "Hello, world!"}, "Invalid JSON Line"] + with file.open("w") as f: + f.write("\n".join([json.dumps(item) for item in content])) report = check_file(file) assert not report["is_check_passed"] - assert "Error parsing json payload" in report["message"] + assert "Error parsing file." in report["message"] def test_check_jsonl_missing_required_field(tmp_path: Path): # Create a JSONL file missing a required field file = tmp_path / "missing_field.jsonl" content = [ - '{"prompt": "Translate the following sentence.", "completion": "Hello, world!"}', - '{"prompt": "Summarize the text."}', + {"prompt": "Translate the following sentence.", "completion": "Hello, world!"}, + {"prompt": "Summarize the text."}, ] - file.write_text("\n".join(content), encoding="utf-8") + with file.open("w") as f: + f.write("\n".join([json.dumps(item) for item in content])) report = check_file(file) assert not report["is_check_passed"] - assert "Missing 'completion' field was found on line 2" in report["message"] + assert "Error parsing file. Could not detect a format for the line 2" in report["message"] def test_check_jsonl_inconsistent_dataset_format(tmp_path: Path): # Create a JSONL file with inconsistent dataset formats file = tmp_path / "inconsistent_format.jsonl" content = [ - '{"messages": [{"role": "user", "content": "Hi"}]}', - '{"text": "How are you?"}', # Missing 'messages' + {"messages": [{"role": "user", "content": "Hi"}]}, + {"text": "How are you?"}, # Missing 'messages' ] - file.write_text("\n".join(content), encoding="utf-8") + with file.open("w") as f: + f.write("\n".join([json.dumps(item) for item in content])) report = check_file(file) @@ -144,8 +152,9 @@ def test_check_jsonl_inconsistent_dataset_format(tmp_path: Path): def test_check_jsonl_invalid_role(tmp_path: Path): # Create a JSONL file with an invalid role file = tmp_path / "invalid_role.jsonl" - content = ['{"messages": [{"role": "invalid_role", "content": "Hi"}]}'] - file.write_text("\n".join(content), encoding="utf-8") + content = [{"messages": [{"role": "invalid_role", "content": "Hi"}]}] + with file.open("w") as f: + f.write("\n".join([json.dumps(item) for item in content])) report = check_file(file) @@ -157,9 +166,10 @@ def test_check_jsonl_non_alternating_roles(tmp_path: Path): # Create a JSONL file with non-alternating user/assistant roles file = tmp_path / "non_alternating_roles.jsonl" content = [ - '{"messages": [{"role": "user", "content": "Hi"}, {"role": "user", "content": "Hello again"}]}' + {"messages": [{"role": "user", "content": "Hi"}, {"role": "user", "content": "Hello again"}]} ] - file.write_text("\n".join(content), encoding="utf-8") + with file.open("w") as f: + f.write("\n".join([json.dumps(item) for item in content])) report = check_file(file) From ff47c0266edad7a634e4d9d02e9458ae359b4327 Mon Sep 17 00:00:00 2001 From: Artem Chumachenko Date: Thu, 7 Nov 2024 14:03:21 +0100 Subject: [PATCH 06/21] style --- tests/unit/test_files_checks.py | 49 +++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/tests/unit/test_files_checks.py b/tests/unit/test_files_checks.py index f83533bc..6b497292 100644 --- a/tests/unit/test_files_checks.py +++ b/tests/unit/test_files_checks.py @@ -43,15 +43,24 @@ def test_check_jsonl_valid_conversational_single_turn(tmp_path: Path): # Create a valid JSONL file with conversational format and 1 user-assistant turn pair file = tmp_path / "valid_conversational_single_turn.jsonl" content = [ - {"messages": [{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi there!"}]}, - {"messages": [{"role": "user", "content": "How are you?"}, {"role": "assistant", "content": "I am fine."}]}, + { + "messages": [ + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there!"}, + ] + }, + { + "messages": [ + {"role": "user", "content": "How are you?"}, + {"role": "assistant", "content": "I am fine."}, + ] + }, ] with file.open("w") as f: f.write("\n".join([json.dumps(item) for item in content])) report = check_file(file) - assert report["is_check_passed"] assert report["utf8"] assert report["num_samples"] == len(content) @@ -62,8 +71,25 @@ def test_check_jsonl_valid_conversational_multiple_turns(tmp_path: Path): # Create a valid JSONL file with conversational format and multiple user-assistant turn pairs file = tmp_path / "valid_conversational_multiple_turns.jsonl" content = [ - {"messages": [{"role": "user", "content": "Is it going to rain today?"}, {"role": "assistant", "content": "Yes, expect showers in the afternoon."}, {"role": "user", "content": "What is the weather like in Tokyo?"}, {"role": "assistant", "content": "It is sunny with a chance of rain."}]}, - {"messages": [{"role": "user", "content": "Who won the game last night?"}, {"role": "assistant", "content": "The home team won by two points."}, {"role": "user", "content": "What is the weather like in Amsterdam?"}, {"role": "assistant", "content": "It is cloudy with a chance of snow."}]}, + { + "messages": [ + {"role": "user", "content": "Is it going to rain today?"}, + { + "role": "assistant", + "content": "Yes, expect showers in the afternoon.", + }, + {"role": "user", "content": "What is the weather like in Tokyo?"}, + {"role": "assistant", "content": "It is sunny with a chance of rain."}, + ] + }, + { + "messages": [ + {"role": "user", "content": "Who won the game last night?"}, + {"role": "assistant", "content": "The home team won by two points."}, + {"role": "user", "content": "What is the weather like in Amsterdam?"}, + {"role": "assistant", "content": "It is cloudy with a chance of snow."}, + ] + }, ] with file.open("w") as f: f.write("\n".join([json.dumps(item) for item in content])) @@ -83,7 +109,6 @@ def test_check_jsonl_empty_file(tmp_path: Path): report = check_file(file) - assert not report["is_check_passed"] assert report["message"] == "File is empty" assert report["file_size"] == 0 @@ -127,7 +152,10 @@ def test_check_jsonl_missing_required_field(tmp_path: Path): report = check_file(file) assert not report["is_check_passed"] - assert "Error parsing file. Could not detect a format for the line 2" in report["message"] + assert ( + "Error parsing file. Could not detect a format for the line 2" + in report["message"] + ) def test_check_jsonl_inconsistent_dataset_format(tmp_path: Path): @@ -166,7 +194,12 @@ def test_check_jsonl_non_alternating_roles(tmp_path: Path): # Create a JSONL file with non-alternating user/assistant roles file = tmp_path / "non_alternating_roles.jsonl" content = [ - {"messages": [{"role": "user", "content": "Hi"}, {"role": "user", "content": "Hello again"}]} + { + "messages": [ + {"role": "user", "content": "Hi"}, + {"role": "user", "content": "Hello again"}, + ] + } ] with file.open("w") as f: f.write("\n".join([json.dumps(item) for item in content])) From 8a6b63dc84ddc47b0778b0d6a5b1280207527148 Mon Sep 17 00:00:00 2001 From: Artem Chumachenko Date: Thu, 7 Nov 2024 15:37:08 +0100 Subject: [PATCH 07/21] more tests --- src/together/utils/files.py | 2 +- tests/unit/test_files_checks.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/src/together/utils/files.py b/src/together/utils/files.py index a3537ba8..091ff15f 100644 --- a/src/together/utils/files.py +++ b/src/together/utils/files.py @@ -227,7 +227,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: report_dict["load_json"] = True - except ValueError as _: + except ValueError: report_dict["load_json"] = False if idx < 0: report_dict["message"] = ( diff --git a/tests/unit/test_files_checks.py b/tests/unit/test_files_checks.py index 6b497292..7cfdac3e 100644 --- a/tests/unit/test_files_checks.py +++ b/tests/unit/test_files_checks.py @@ -208,3 +208,33 @@ def test_check_jsonl_non_alternating_roles(tmp_path: Path): assert not report["is_check_passed"] assert "Invalid role turns" in report["message"] + + +def test_check_jsonl_invalid_value_type(tmp_path: Path): + # Create a JSONL file with an invalid value type + file = tmp_path / "invalid_value_type.jsonl" + content = [{"text": 123}] + with file.open("w") as f: + f.write("\n".join([json.dumps(item) for item in content])) + + report = check_file(file) + assert not report["is_check_passed"] + assert "Expected string" in report["message"] + + +def test_check_jsonl_missing_field_in_conversation(tmp_path: Path): + file = tmp_path / "missing_field_in_conversation.jsonl" + content = [ + { + "messages": [ + {"role": "user", "content": "Hi"}, + {"role": "assistant"}, + ] + } + ] + with file.open("w") as f: + f.write("\n".join([json.dumps(item) for item in content])) + + report = check_file(file) + assert not report["is_check_passed"] + assert "Field 'content' is missing for a turn" in report["message"] From ad9d0a877aa195972331262254144143b5827ba0 Mon Sep 17 00:00:00 2001 From: Artem Chumachenko Date: Thu, 7 Nov 2024 15:56:29 +0100 Subject: [PATCH 08/21] enhance logic --- src/together/resources/finetune.py | 3 +- src/together/utils/files.py | 117 +++++++++++++++++------------ 2 files changed, 69 insertions(+), 51 deletions(-) diff --git a/src/together/resources/finetune.py b/src/together/resources/finetune.py index 40de2a1d..0148b50f 100644 --- a/src/together/resources/finetune.py +++ b/src/together/resources/finetune.py @@ -157,7 +157,8 @@ def create( Defaults to False. model_limits (FinetuneTrainingLimits, optional): Limits for the hyperparameters the model in Fine-tuning. Defaults to None. - train_on_inputs (bool, optional): Whether to mask the inputs in conversational data. Defaults to "auto". + train_on_inputs (bool, optional): Whether to mask the user messages in conversational data or prompts in instruction data. + Defaults to "auto". Returns: FinetuneResponse: Object containing information about fine-tuning job. diff --git a/src/together/utils/files.py b/src/together/utils/files.py index 091ff15f..6a45511c 100644 --- a/src/together/utils/files.py +++ b/src/together/utils/files.py @@ -23,8 +23,16 @@ class InvalidFileFormatError(Exception): """Exception raised for invalid file formats during file checks.""" - def __init__(self) -> None: - super().__init__() + def __init__( + self, + message: str = "", + line_number: int | None = None, + field: str | None = None, + ) -> None: + super().__init__(message) + self.message = message + self.line_number = line_number + self.field = field def check_file( @@ -101,25 +109,23 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: dataset_format = None with file.open() as f: - # idx must be instantiated so decode errors (e.g. file is a tar) or empty files are caught idx = -1 try: for idx, line in enumerate(f): - json_line = json.loads(line) # each line in jsonlines should be a json + json_line = json.loads(line) if not isinstance(json_line, dict): - report_dict["line_type"] = False - report_dict["message"] = ( - f"Error parsing file. Invalid format on line {idx + 1} of the input file. " - 'Example of valid json: {"text": "my sample string"}. ' + raise InvalidFileFormatError( + message=( + f"Error parsing file. Invalid format on line {idx + 1} of the input file. " + 'Example of valid json: {"text": "my sample string"}. ' + ), + line_number=idx + 1, + field="line_type", ) - report_dict["is_check_passed"] = False - raise InvalidFileFormatError - current_format = None for possible_format in JSONL_REQUIRED_COLUMNS_MAP: - # Check if every column in the data contains in if all( column in json_line for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format] @@ -127,71 +133,72 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: if current_format is None: current_format = possible_format elif current_format != possible_format: - report_dict["message"] = ( - "Found multiple dataset formats in the input file. " - f"Got {current_format} and {possible_format} on line {idx + 1}." + raise InvalidFileFormatError( + message="Found multiple dataset formats in the input file. " + f"Got {current_format} and {possible_format} on line {idx + 1}.", + line_number=idx + 1, ) - raise InvalidFileFormatError if current_format is None: - report_dict["message"] = ( - f"Error parsing file. Could not detect a format for the line {idx + 1} with the columns:\n" - f"{json_line.keys()}" + raise InvalidFileFormatError( + message=( + f"Error parsing file. Could not detect a format for the line {idx + 1} with the columns:\n" + f"{json_line.keys()}" + ), + line_number=idx + 1, ) - raise InvalidFileFormatError if current_format == DatasetFormat.CONVERSATION: message_column = JSONL_REQUIRED_COLUMNS_MAP[ DatasetFormat.CONVERSATION ][0] if not isinstance(json_line[message_column], list): - report_dict["key_value"] = False - report_dict["message"] = ( - f"Invalid format on line {idx + 1} of the input file. " - f"Expected a list of messages. Found {type(json_line[message_column])}" + raise InvalidFileFormatError( + message=f"Invalid format on line {idx + 1} of the input file. " + f"Expected a list of messages. Found {type(json_line[message_column])}", + line_number=idx + 1, + field="key_value", ) - raise InvalidFileFormatError previous_role = "" for turn in json_line[message_column]: for column in REQUIRED_COLUMNS_MESSAGE: if column not in turn: - report_dict["key_value"] = False - report_dict["message"] = ( - f"Field '{column}' is missing for a turn `{turn}` on line {idx + 1} " - "of the the input file." + raise InvalidFileFormatError( + message=f"Field '{column}' is missing for a turn `{turn}` on line {idx + 1} " + "of the the input file.", + line_number=idx + 1, + field="key_value", ) - raise InvalidFileFormatError else: if not isinstance(turn[column], str): - report_dict["text_field"] = False - report_dict["message"] = ( - f"Invalid format on line {idx + 1} in the column {column} for turn `{turn}` " - f"of the input file. Expected string. Found {type(turn[column])}" + raise InvalidFileFormatError( + message=f"Invalid format on line {idx + 1} in the column {column} for turn `{turn}` " + f"of the input file. Expected string. Found {type(turn[column])}", + line_number=idx + 1, + field="text_field", ) - raise InvalidFileFormatError role = turn["role"] if role not in POSSIBLE_ROLES_CONVERSATION: - report_dict["key_value"] = False - report_dict["message"] = ( - f"Found invalid role '{role}' in the messages on the line {idx + 1}. " - f"Possible roles in the conversation are: {POSSIBLE_ROLES_CONVERSATION}" + raise InvalidFileFormatError( + message=f"Found invalid role '{role}' in the messages on the line {idx + 1}. " + f"Possible roles in the conversation are: {POSSIBLE_ROLES_CONVERSATION}", + line_number=idx + 1, + field="key_value", ) - raise InvalidFileFormatError if previous_role == role: - report_dict["key_value"] = False - report_dict["message"] = ( - f"Invalid role turns on line {idx + 1} of the input file. " - "'user' and 'assistant' roles must alternate user/assistant/user/assistant/..." + raise InvalidFileFormatError( + message=f"Invalid role turns on line {idx + 1} of the input file. " + "'user' and 'assistant' roles must alternate user/assistant/user/assistant/...", + line_number=idx + 1, + field="key_value", ) - raise InvalidFileFormatError previous_role = role else: - # check to make sure the value of the keys is a string for column in JSONL_REQUIRED_COLUMNS_MAP[current_format]: if not isinstance(json_line[column], str): report_dict["key_value"] = False @@ -199,7 +206,11 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: f'Invalid value type for "{column}" key on line {idx + 1}. ' f"Expected string. Found {type(json_line[column])}." ) - raise InvalidFileFormatError + raise InvalidFileFormatError( + message=report_dict["message"], + line_number=idx + 1, + field="key_value", + ) if dataset_format is None: dataset_format = current_format @@ -210,9 +221,10 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: f"Got {dataset_format} for the first line and {current_format} " f"for the line {idx + 1}." ) - raise InvalidFileFormatError + raise InvalidFileFormatError( + message=report_dict["message"], line_number=idx + 1 + ) - # make sure this is outside the for idx, line in enumerate(f): for loop if idx + 1 < MIN_SAMPLES: report_dict["min_samples"] = False report_dict["message"] = ( @@ -239,9 +251,14 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: f"Error parsing json payload. Unexpected format on line {idx + 1}." ) report_dict["is_check_passed"] = False - except InvalidFileFormatError: + except InvalidFileFormatError as e: report_dict["load_json"] = False report_dict["is_check_passed"] = False + report_dict["message"] = e.message + if e.line_number is not None: + report_dict["line_number"] = e.line_number + if e.field is not None: + report_dict[e.field] = False if "text_field" not in report_dict: report_dict["text_field"] = True From 487fbae7bd1f7c6df03f69608bb03f97a654c847 Mon Sep 17 00:00:00 2001 From: Artem Chumachenko Date: Thu, 7 Nov 2024 15:58:51 +0100 Subject: [PATCH 09/21] enhance logic --- src/together/utils/files.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/src/together/utils/files.py b/src/together/utils/files.py index 6a45511c..cc8c7b7d 100644 --- a/src/together/utils/files.py +++ b/src/together/utils/files.py @@ -201,13 +201,9 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: else: for column in JSONL_REQUIRED_COLUMNS_MAP[current_format]: if not isinstance(json_line[column], str): - report_dict["key_value"] = False - report_dict["message"] = ( - f'Invalid value type for "{column}" key on line {idx + 1}. ' - f"Expected string. Found {type(json_line[column])}." - ) raise InvalidFileFormatError( - message=report_dict["message"], + message=f'Invalid value type for "{column}" key on line {idx + 1}. ' + f"Expected string. Found {type(json_line[column])}.", line_number=idx + 1, field="key_value", ) @@ -216,13 +212,11 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: dataset_format = current_format elif current_format is not None: if current_format != dataset_format: - report_dict["message"] = ( - "All samples in the dataset must have the same dataset format. " - f"Got {dataset_format} for the first line and {current_format} " - f"for the line {idx + 1}." - ) raise InvalidFileFormatError( - message=report_dict["message"], line_number=idx + 1 + message="All samples in the dataset must have the same dataset format. " + f"Got {dataset_format} for the first line and {current_format} " + f"for the line {idx + 1}.", + line_number=idx + 1, ) if idx + 1 < MIN_SAMPLES: From a400517ee6a12b9fc4a43907f09a3efde7f8e031 Mon Sep 17 00:00:00 2001 From: Artem Chumachenko Date: Fri, 8 Nov 2024 16:26:06 +0100 Subject: [PATCH 10/21] pr feedback part 1 --- src/together/resources/finetune.py | 10 ++++--- src/together/utils/files.py | 42 ++++++++++++++++-------------- tests/unit/test_files_checks.py | 32 +++++++++++------------ 3 files changed, 46 insertions(+), 38 deletions(-) diff --git a/src/together/resources/finetune.py b/src/together/resources/finetune.py index 0148b50f..3c17c6ad 100644 --- a/src/together/resources/finetune.py +++ b/src/together/resources/finetune.py @@ -140,7 +140,7 @@ def create( n_evals (int, optional): Number of evaluation loops to run. Defaults to 0. n_checkpoints (int, optional): Number of checkpoints to save during fine-tuning. Defaults to 1. - batch_size (int, optional): Batch size for fine-tuning. Defaults to max. + batch_size (int or "max"): Batch size for fine-tuning. Defaults to max. learning_rate (float, optional): Learning rate multiplier to use for training Defaults to 0.00001. warmup_ratio (float, optional): Warmup ratio for learning rate scheduler. @@ -157,7 +157,11 @@ def create( Defaults to False. model_limits (FinetuneTrainingLimits, optional): Limits for the hyperparameters the model in Fine-tuning. Defaults to None. - train_on_inputs (bool, optional): Whether to mask the user messages in conversational data or prompts in instruction data. + train_on_inputs (bool or "auto"): Whether to mask the user messages in conversational data or prompts in instruction data. + "auto" will automatically determine whether to mask the inputs based on the data format. + Dataset with "text" (General format) field will not mask the inputs by default. + Dataset with "messages" (Conversational format) or "prompt" and "completion" (Instruction format) + fields will mask the inputs by default. Defaults to "auto". Returns: @@ -472,7 +476,7 @@ async def create( Defaults to False. model_limits (FinetuneTrainingLimits, optional): Limits for the hyperparameters the model in Fine-tuning. Defaults to None. - train_on_inputs (bool, optional): Whether to mask the inputs in conversational data. Defaults to "auto". + train_on_inputs (bool or "auto"): Whether to mask the user messages in conversational data or prompts in instruction data. Returns: FinetuneResponse: Object containing information about fine-tuning job. diff --git a/src/together/utils/files.py b/src/together/utils/files.py index cc8c7b7d..c382d3bb 100644 --- a/src/together/utils/files.py +++ b/src/together/utils/files.py @@ -20,19 +20,19 @@ ) -class InvalidFileFormatError(Exception): +class InvalidFileFormatError(ValueError): """Exception raised for invalid file formats during file checks.""" def __init__( self, message: str = "", line_number: int | None = None, - field: str | None = None, + error_source: str | None = None, ) -> None: super().__init__(message) self.message = message self.line_number = line_number - self.field = field + self.error_source = error_source def check_file( @@ -50,7 +50,7 @@ def check_file( "line_type": None, "text_field": None, "key_value": None, - "min_samples": None, + "has_min_samples": None, "num_samples": None, "load_json": None, } @@ -121,7 +121,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: 'Example of valid json: {"text": "my sample string"}. ' ), line_number=idx + 1, - field="line_type", + error_source="line_type", ) current_format = None @@ -137,6 +137,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: message="Found multiple dataset formats in the input file. " f"Got {current_format} and {possible_format} on line {idx + 1}.", line_number=idx + 1, + error_source="format", ) if current_format is None: @@ -146,6 +147,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: f"{json_line.keys()}" ), line_number=idx + 1, + error_source="format", ) if current_format == DatasetFormat.CONVERSATION: @@ -157,7 +159,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: message=f"Invalid format on line {idx + 1} of the input file. " f"Expected a list of messages. Found {type(json_line[message_column])}", line_number=idx + 1, - field="key_value", + error_source="key_value", ) previous_role = "" @@ -165,10 +167,10 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: for column in REQUIRED_COLUMNS_MESSAGE: if column not in turn: raise InvalidFileFormatError( - message=f"Field '{column}' is missing for a turn `{turn}` on line {idx + 1} " + message=f"Field `{column}` is missing for a turn `{turn}` on line {idx + 1} " "of the the input file.", line_number=idx + 1, - field="key_value", + error_source="key_value", ) else: if not isinstance(turn[column], str): @@ -176,24 +178,24 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: message=f"Invalid format on line {idx + 1} in the column {column} for turn `{turn}` " f"of the input file. Expected string. Found {type(turn[column])}", line_number=idx + 1, - field="text_field", + error_source="text_field", ) role = turn["role"] if role not in POSSIBLE_ROLES_CONVERSATION: raise InvalidFileFormatError( - message=f"Found invalid role '{role}' in the messages on the line {idx + 1}. " + message=f"Found invalid role `{role}` in the messages on the line {idx + 1}. " f"Possible roles in the conversation are: {POSSIBLE_ROLES_CONVERSATION}", line_number=idx + 1, - field="key_value", + error_source="key_value", ) if previous_role == role: raise InvalidFileFormatError( message=f"Invalid role turns on line {idx + 1} of the input file. " - "'user' and 'assistant' roles must alternate user/assistant/user/assistant/...", + "`user` and `assistant` roles must alternate user/assistant/user/assistant/...", line_number=idx + 1, - field="key_value", + error_source="key_value", ) previous_role = role @@ -205,7 +207,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: message=f'Invalid value type for "{column}" key on line {idx + 1}. ' f"Expected string. Found {type(json_line[column])}.", line_number=idx + 1, - field="key_value", + error_source="key_value", ) if dataset_format is None: @@ -217,10 +219,11 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: f"Got {dataset_format} for the first line and {current_format} " f"for the line {idx + 1}.", line_number=idx + 1, + error_source="format", ) if idx + 1 < MIN_SAMPLES: - report_dict["min_samples"] = False + report_dict["has_min_samples"] = False report_dict["message"] = ( f"Processing {file} resulted in only {idx + 1} samples. " f"Our minimum is {MIN_SAMPLES} samples. " @@ -228,7 +231,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: report_dict["is_check_passed"] = False else: report_dict["num_samples"] = idx + 1 - report_dict["min_samples"] = True + report_dict["has_min_samples"] = True report_dict["is_check_passed"] = True report_dict["load_json"] = True @@ -251,8 +254,8 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: report_dict["message"] = e.message if e.line_number is not None: report_dict["line_number"] = e.line_number - if e.field is not None: - report_dict[e.field] = False + if e.error_source is not None: + report_dict[e.error_source] = False if "text_field" not in report_dict: report_dict["text_field"] = True @@ -295,7 +298,8 @@ def _check_parquet(file: Path) -> Dict[str, Any]: num_samples = len(table) if num_samples < MIN_SAMPLES: - report_dict["min_samples"] = ( + report_dict["has_min_samples"] = False + report_dict["message"] = ( f"Processing {file} resulted in only {num_samples} samples. " f"Our minimum is {MIN_SAMPLES} samples. " ) diff --git a/tests/unit/test_files_checks.py b/tests/unit/test_files_checks.py index 7cfdac3e..0412be64 100644 --- a/tests/unit/test_files_checks.py +++ b/tests/unit/test_files_checks.py @@ -11,14 +11,14 @@ def test_check_jsonl_valid_general(tmp_path: Path): file = tmp_path / "valid.jsonl" content = [{"text": "Hello, world!"}, {"text": "How are you?"}] with file.open("w") as f: - f.write("\n".join([json.dumps(item) for item in content])) + f.write("\n".join(json.dumps(item) for item in content)) report = check_file(file) assert report["is_check_passed"] assert report["utf8"] assert report["num_samples"] == len(content) - assert report["min_samples"] >= MIN_SAMPLES + assert report["has_min_samples"] def test_check_jsonl_valid_instruction(tmp_path: Path): @@ -26,17 +26,17 @@ def test_check_jsonl_valid_instruction(tmp_path: Path): file = tmp_path / "valid_instruction.jsonl" content = [ {"prompt": "Translate the following sentence.", "completion": "Hello, world!"}, - {"prompt": "Summarize the text.", "completion": "OpenAI creates advanced AI."}, + {"prompt": "Summarize the text.", "completion": "Weyland-Yutani Corporation creates advanced AI."}, ] with file.open("w") as f: - f.write("\n".join([json.dumps(item) for item in content])) + f.write("\n".join(json.dumps(item) for item in content)) report = check_file(file) assert report["is_check_passed"] assert report["utf8"] assert report["num_samples"] == len(content) - assert report["min_samples"] >= MIN_SAMPLES + assert report["has_min_samples"] def test_check_jsonl_valid_conversational_single_turn(tmp_path: Path): @@ -57,14 +57,14 @@ def test_check_jsonl_valid_conversational_single_turn(tmp_path: Path): }, ] with file.open("w") as f: - f.write("\n".join([json.dumps(item) for item in content])) + f.write("\n".join(json.dumps(item) for item in content)) report = check_file(file) assert report["is_check_passed"] assert report["utf8"] assert report["num_samples"] == len(content) - assert report["min_samples"] >= MIN_SAMPLES + assert report["has_min_samples"] def test_check_jsonl_valid_conversational_multiple_turns(tmp_path: Path): @@ -92,14 +92,14 @@ def test_check_jsonl_valid_conversational_multiple_turns(tmp_path: Path): }, ] with file.open("w") as f: - f.write("\n".join([json.dumps(item) for item in content])) + f.write("\n".join(json.dumps(item) for item in content)) report = check_file(file) assert report["is_check_passed"] assert report["utf8"] assert report["num_samples"] == len(content) - assert report["min_samples"] >= MIN_SAMPLES + assert report["has_min_samples"] def test_check_jsonl_empty_file(tmp_path: Path): @@ -131,7 +131,7 @@ def test_check_jsonl_invalid_json(tmp_path: Path): file = tmp_path / "invalid_json.jsonl" content = [{"text": "Hello, world!"}, "Invalid JSON Line"] with file.open("w") as f: - f.write("\n".join([json.dumps(item) for item in content])) + f.write("\n".join(json.dumps(item) for item in content)) report = check_file(file) @@ -147,7 +147,7 @@ def test_check_jsonl_missing_required_field(tmp_path: Path): {"prompt": "Summarize the text."}, ] with file.open("w") as f: - f.write("\n".join([json.dumps(item) for item in content])) + f.write("\n".join(json.dumps(item) for item in content)) report = check_file(file) @@ -166,7 +166,7 @@ def test_check_jsonl_inconsistent_dataset_format(tmp_path: Path): {"text": "How are you?"}, # Missing 'messages' ] with file.open("w") as f: - f.write("\n".join([json.dumps(item) for item in content])) + f.write("\n".join(json.dumps(item) for item in content)) report = check_file(file) @@ -182,7 +182,7 @@ def test_check_jsonl_invalid_role(tmp_path: Path): file = tmp_path / "invalid_role.jsonl" content = [{"messages": [{"role": "invalid_role", "content": "Hi"}]}] with file.open("w") as f: - f.write("\n".join([json.dumps(item) for item in content])) + f.write("\n".join(json.dumps(item) for item in content)) report = check_file(file) @@ -202,7 +202,7 @@ def test_check_jsonl_non_alternating_roles(tmp_path: Path): } ] with file.open("w") as f: - f.write("\n".join([json.dumps(item) for item in content])) + f.write("\n".join(json.dumps(item) for item in content)) report = check_file(file) @@ -215,7 +215,7 @@ def test_check_jsonl_invalid_value_type(tmp_path: Path): file = tmp_path / "invalid_value_type.jsonl" content = [{"text": 123}] with file.open("w") as f: - f.write("\n".join([json.dumps(item) for item in content])) + f.write("\n".join(json.dumps(item) for item in content)) report = check_file(file) assert not report["is_check_passed"] @@ -233,7 +233,7 @@ def test_check_jsonl_missing_field_in_conversation(tmp_path: Path): } ] with file.open("w") as f: - f.write("\n".join([json.dumps(item) for item in content])) + f.write("\n".join(json.dumps(item) for item in content)) report = check_file(file) assert not report["is_check_passed"] From c933f1643e56d9099f442975e47d764959033a18 Mon Sep 17 00:00:00 2001 From: Artem Chumachenko Date: Fri, 8 Nov 2024 16:30:59 +0100 Subject: [PATCH 11/21] style and fixed --- src/together/utils/files.py | 16 ++++++++-------- tests/unit/test_files_checks.py | 9 ++++++--- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/together/utils/files.py b/src/together/utils/files.py index c382d3bb..95f2fab1 100644 --- a/src/together/utils/files.py +++ b/src/together/utils/files.py @@ -236,6 +236,14 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: report_dict["load_json"] = True + except InvalidFileFormatError as e: + report_dict["load_json"] = False + report_dict["is_check_passed"] = False + report_dict["message"] = e.message + if e.line_number is not None: + report_dict["line_number"] = e.line_number + if e.error_source is not None: + report_dict[e.error_source] = False except ValueError: report_dict["load_json"] = False if idx < 0: @@ -248,14 +256,6 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: f"Error parsing json payload. Unexpected format on line {idx + 1}." ) report_dict["is_check_passed"] = False - except InvalidFileFormatError as e: - report_dict["load_json"] = False - report_dict["is_check_passed"] = False - report_dict["message"] = e.message - if e.line_number is not None: - report_dict["line_number"] = e.line_number - if e.error_source is not None: - report_dict[e.error_source] = False if "text_field" not in report_dict: report_dict["text_field"] = True diff --git a/tests/unit/test_files_checks.py b/tests/unit/test_files_checks.py index 0412be64..dcabb6e3 100644 --- a/tests/unit/test_files_checks.py +++ b/tests/unit/test_files_checks.py @@ -26,7 +26,10 @@ def test_check_jsonl_valid_instruction(tmp_path: Path): file = tmp_path / "valid_instruction.jsonl" content = [ {"prompt": "Translate the following sentence.", "completion": "Hello, world!"}, - {"prompt": "Summarize the text.", "completion": "Weyland-Yutani Corporation creates advanced AI."}, + { + "prompt": "Summarize the text.", + "completion": "Weyland-Yutani Corporation creates advanced AI.", + }, ] with file.open("w") as f: f.write("\n".join(json.dumps(item) for item in content)) @@ -187,7 +190,7 @@ def test_check_jsonl_invalid_role(tmp_path: Path): report = check_file(file) assert not report["is_check_passed"] - assert "Found invalid role 'invalid_role'" in report["message"] + assert "Found invalid role `invalid_role`" in report["message"] def test_check_jsonl_non_alternating_roles(tmp_path: Path): @@ -237,4 +240,4 @@ def test_check_jsonl_missing_field_in_conversation(tmp_path: Path): report = check_file(file) assert not report["is_check_passed"] - assert "Field 'content' is missing for a turn" in report["message"] + assert "Field `content` is missing for a turn" in report["message"] From 642333044e9eacd6e047c802530d27e7f55cc7bf Mon Sep 17 00:00:00 2001 From: Artem Chumachenko Date: Tue, 12 Nov 2024 17:10:49 +0100 Subject: [PATCH 12/21] pr feedback --- src/together/cli/api/finetune.py | 6 ++++-- src/together/resources/finetune.py | 11 ++++++++--- src/together/utils/files.py | 2 +- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/together/cli/api/finetune.py b/src/together/cli/api/finetune.py index 4d4896b2..1983f2b6 100644 --- a/src/together/cli/api/finetune.py +++ b/src/together/cli/api/finetune.py @@ -12,7 +12,7 @@ from together import Together from together.cli.api.utils import BOOL_WITH_AUTO, INT_WITH_MAX -from together.utils import finetune_price_to_dollars, log_warn, parse_timestamp +from together.utils import finetune_price_to_dollars, log_warn, log_warn_once, parse_timestamp from together.types.finetune import DownloadCheckpointType, FinetuneTrainingLimits @@ -97,7 +97,8 @@ def fine_tuning(ctx: click.Context) -> None: "--train-on-inputs", type=BOOL_WITH_AUTO, default="auto", - help="Whether to mask the user messages in conversational data or prompts in instruction data", + help="Whether to mask the user messages in conversational data or prompts in instruction data. " + "`auto` will automatically determine whether to mask the inputs based on the data format.", ) def create( ctx: click.Context, @@ -148,6 +149,7 @@ def create( ) if lora: + log_warn_once("LoRA rank default has been changed from 8 to 64 as the maximum available for each model.") if model_limits.lora_training is None: raise click.BadParameter( f"LoRA fine-tuning is not supported for the model `{model}`" diff --git a/src/together/resources/finetune.py b/src/together/resources/finetune.py index 3c17c6ad..79596dd2 100644 --- a/src/together/resources/finetune.py +++ b/src/together/resources/finetune.py @@ -159,9 +159,9 @@ def create( Defaults to None. train_on_inputs (bool or "auto"): Whether to mask the user messages in conversational data or prompts in instruction data. "auto" will automatically determine whether to mask the inputs based on the data format. - Dataset with "text" (General format) field will not mask the inputs by default. - Dataset with "messages" (Conversational format) or "prompt" and "completion" (Instruction format) - fields will mask the inputs by default. + For datasets with the "text" field (general format), inputs will not be masked. + For datasets with the "messages" field (conversational format) or "prompt" and "completion" fields + (Instruction format), inputs will be masked. Defaults to "auto". Returns: @@ -477,6 +477,11 @@ async def create( model_limits (FinetuneTrainingLimits, optional): Limits for the hyperparameters the model in Fine-tuning. Defaults to None. train_on_inputs (bool or "auto"): Whether to mask the user messages in conversational data or prompts in instruction data. + "auto" will automatically determine whether to mask the inputs based on the data format. + For datasets with the "text" field (general format), inputs will not be masked. + For datasets with the "messages" field (conversational format) or "prompt" and "completion" fields + (Instruction format), inputs will be masked. + Defaults to "auto". Returns: FinetuneResponse: Object containing information about fine-tuning job. diff --git a/src/together/utils/files.py b/src/together/utils/files.py index 95f2fab1..360a0bce 100644 --- a/src/together/utils/files.py +++ b/src/together/utils/files.py @@ -162,7 +162,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: error_source="key_value", ) - previous_role = "" + previous_role = None for turn in json_line[message_column]: for column in REQUIRED_COLUMNS_MESSAGE: if column not in turn: From 85c51c4f90efa35eecd33a848c5b61b575348fcf Mon Sep 17 00:00:00 2001 From: Artem Chumachenko Date: Tue, 12 Nov 2024 17:13:33 +0100 Subject: [PATCH 13/21] style --- src/together/cli/api/finetune.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/together/cli/api/finetune.py b/src/together/cli/api/finetune.py index 1983f2b6..74f6a483 100644 --- a/src/together/cli/api/finetune.py +++ b/src/together/cli/api/finetune.py @@ -12,7 +12,12 @@ from together import Together from together.cli.api.utils import BOOL_WITH_AUTO, INT_WITH_MAX -from together.utils import finetune_price_to_dollars, log_warn, log_warn_once, parse_timestamp +from together.utils import ( + finetune_price_to_dollars, + log_warn, + log_warn_once, + parse_timestamp, +) from together.types.finetune import DownloadCheckpointType, FinetuneTrainingLimits @@ -149,7 +154,9 @@ def create( ) if lora: - log_warn_once("LoRA rank default has been changed from 8 to 64 as the maximum available for each model.") + log_warn_once( + "LoRA rank default has been changed from 8 to 64 as the maximum available for each model." + ) if model_limits.lora_training is None: raise click.BadParameter( f"LoRA fine-tuning is not supported for the model `{model}`" From 6268151a65d564e833ebb66a04304d91d88d9df2 Mon Sep 17 00:00:00 2001 From: Artem Chumachenko Date: Wed, 13 Nov 2024 11:19:34 +0100 Subject: [PATCH 14/21] style --- src/together/utils/files.py | 9 +++++++++ tests/unit/test_files_checks.py | 22 ++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/src/together/utils/files.py b/src/together/utils/files.py index 360a0bce..570aa508 100644 --- a/src/together/utils/files.py +++ b/src/together/utils/files.py @@ -162,6 +162,15 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: error_source="key_value", ) + for turn_id, turn in enumerate(json_line[message_column]): + if not isinstance(turn, dict): + raise InvalidFileFormatError( + message=f"Invalid format on line {idx + 1} of the input file. " + f"Expected a dictionary in the {turn_id + 1} turn. Found {type(turn)}", + line_number=idx + 1, + error_source="key_value", + ) + previous_role = None for turn in json_line[message_column]: for column in REQUIRED_COLUMNS_MESSAGE: diff --git a/tests/unit/test_files_checks.py b/tests/unit/test_files_checks.py index dcabb6e3..e79f1986 100644 --- a/tests/unit/test_files_checks.py +++ b/tests/unit/test_files_checks.py @@ -241,3 +241,25 @@ def test_check_jsonl_missing_field_in_conversation(tmp_path: Path): report = check_file(file) assert not report["is_check_passed"] assert "Field `content` is missing for a turn" in report["message"] + + +def test_check_jsonl_wrong_turn_type(tmp_path: Path): + file = tmp_path / "wrong_turn_type.jsonl" + content = [ + { + "messages": [ + "Hi!", + {"role": "user", "content": "Hi"}, + {"role": "assistant"}, + ] + } + ] + with file.open("w") as f: + f.write("\n".join(json.dumps(item) for item in content)) + + report = check_file(file) + assert not report["is_check_passed"] + assert ( + "Invalid format on line 1 of the input file. Expected a dictionary" + in report["message"] + ) From 6f3f8e3e39c2c504cbcb015be935fde986dc6ffc Mon Sep 17 00:00:00 2001 From: Artem Chumachenko Date: Wed, 13 Nov 2024 11:33:24 +0100 Subject: [PATCH 15/21] fix typing --- src/together/resources/finetune.py | 4 +++- src/together/types/finetune.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/together/resources/finetune.py b/src/together/resources/finetune.py index 79596dd2..841b5913 100644 --- a/src/together/resources/finetune.py +++ b/src/together/resources/finetune.py @@ -83,6 +83,8 @@ def createFinetuneRequest( if warmup_ratio > 1 or warmup_ratio < 0: raise ValueError("Warmup ratio should be between 0 and 1") + train_on_inputs_bool = train_on_inputs if train_on_inputs != "auto" else None + finetune_request = FinetuneRequest( model=model, training_file=training_file, @@ -96,7 +98,7 @@ def createFinetuneRequest( training_type=training_type, suffix=suffix, wandb_key=wandb_api_key, - train_on_inputs=train_on_inputs, + train_on_inputs=train_on_inputs_bool, ) return finetune_request diff --git a/src/together/types/finetune.py b/src/together/types/finetune.py index 917a5143..e7733458 100644 --- a/src/together/types/finetune.py +++ b/src/together/types/finetune.py @@ -163,7 +163,7 @@ class FinetuneRequest(BaseModel): # weights & biases api key wandb_key: str | None = None training_type: FullTrainingType | LoRATrainingType | None = None - train_on_inputs: bool | Literal["auto"] = "auto" + train_on_inputs: bool | None = None class FinetuneResponse(BaseModel): @@ -231,7 +231,7 @@ class FinetuneResponse(BaseModel): # training file metadata training_file_num_lines: int | None = Field(None, alias="TrainingFileNumLines") training_file_size: int | None = Field(None, alias="TrainingFileSize") - train_on_inputs: bool | Literal["auto"] = "auto" + train_on_inputs: bool | None = None @field_validator("training_type") @classmethod From 567abdad071f7b231e2f8d8763baa2ea668b7c97 Mon Sep 17 00:00:00 2001 From: Artem Chumachenko Date: Wed, 13 Nov 2024 16:46:18 +0100 Subject: [PATCH 16/21] change to strict boolean --- src/together/types/finetune.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/together/types/finetune.py b/src/together/types/finetune.py index e7733458..7ca28486 100644 --- a/src/together/types/finetune.py +++ b/src/together/types/finetune.py @@ -3,7 +3,7 @@ from enum import Enum from typing import List, Literal -from pydantic import Field, validator, field_validator +from pydantic import StrictBool, Field, validator, field_validator from together.types.abstract import BaseModel from together.types.common import ( @@ -163,7 +163,7 @@ class FinetuneRequest(BaseModel): # weights & biases api key wandb_key: str | None = None training_type: FullTrainingType | LoRATrainingType | None = None - train_on_inputs: bool | None = None + train_on_inputs: StrictBool | None = None class FinetuneResponse(BaseModel): From 268ca77ea13d92610bd10896a8b42324c7f59b27 Mon Sep 17 00:00:00 2001 From: Artem Chumachenko Date: Wed, 13 Nov 2024 16:47:28 +0100 Subject: [PATCH 17/21] error out on train_on_inputs --- src/together/resources/finetune.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/together/resources/finetune.py b/src/together/resources/finetune.py index 841b5913..b686f6c9 100644 --- a/src/together/resources/finetune.py +++ b/src/together/resources/finetune.py @@ -83,6 +83,9 @@ def createFinetuneRequest( if warmup_ratio > 1 or warmup_ratio < 0: raise ValueError("Warmup ratio should be between 0 and 1") + if train_on_inputs is None: + raise ValueError("train_on_inputs cannot be None") + train_on_inputs_bool = train_on_inputs if train_on_inputs != "auto" else None finetune_request = FinetuneRequest( From f8c6166dc5a8693608ae5fc577b2dfe489c699d4 Mon Sep 17 00:00:00 2001 From: Artem Chumachenko Date: Thu, 14 Nov 2024 17:37:49 +0100 Subject: [PATCH 18/21] use "auto" directly --- src/together/cli/api/finetune.py | 7 ++++--- src/together/resources/finetune.py | 7 +------ src/together/types/finetune.py | 4 ++-- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/src/together/cli/api/finetune.py b/src/together/cli/api/finetune.py index 74f6a483..bd9c8514 100644 --- a/src/together/cli/api/finetune.py +++ b/src/together/cli/api/finetune.py @@ -154,9 +154,6 @@ def create( ) if lora: - log_warn_once( - "LoRA rank default has been changed from 8 to 64 as the maximum available for each model." - ) if model_limits.lora_training is None: raise click.BadParameter( f"LoRA fine-tuning is not supported for the model `{model}`" @@ -167,6 +164,10 @@ def create( "batch_size": model_limits.lora_training.max_batch_size, "learning_rate": 1e-3, } + log_warn_once( + f"LoRA rank default has been changed to {default_values['lora_r']} as the max available for the model.\n" + f"Learning rate default for LoRA FT has been changed to {default_values['learning_rate']}." + ) for arg in default_values: arg_source = ctx.get_parameter_source("arg") # type: ignore[attr-defined] if arg_source == ParameterSource.DEFAULT: diff --git a/src/together/resources/finetune.py b/src/together/resources/finetune.py index b686f6c9..79596dd2 100644 --- a/src/together/resources/finetune.py +++ b/src/together/resources/finetune.py @@ -83,11 +83,6 @@ def createFinetuneRequest( if warmup_ratio > 1 or warmup_ratio < 0: raise ValueError("Warmup ratio should be between 0 and 1") - if train_on_inputs is None: - raise ValueError("train_on_inputs cannot be None") - - train_on_inputs_bool = train_on_inputs if train_on_inputs != "auto" else None - finetune_request = FinetuneRequest( model=model, training_file=training_file, @@ -101,7 +96,7 @@ def createFinetuneRequest( training_type=training_type, suffix=suffix, wandb_key=wandb_api_key, - train_on_inputs=train_on_inputs_bool, + train_on_inputs=train_on_inputs, ) return finetune_request diff --git a/src/together/types/finetune.py b/src/together/types/finetune.py index 7ca28486..f1fabb04 100644 --- a/src/together/types/finetune.py +++ b/src/together/types/finetune.py @@ -163,7 +163,7 @@ class FinetuneRequest(BaseModel): # weights & biases api key wandb_key: str | None = None training_type: FullTrainingType | LoRATrainingType | None = None - train_on_inputs: StrictBool | None = None + train_on_inputs: StrictBool | Literal["auto"] = "auto" class FinetuneResponse(BaseModel): @@ -231,7 +231,7 @@ class FinetuneResponse(BaseModel): # training file metadata training_file_num_lines: int | None = Field(None, alias="TrainingFileNumLines") training_file_size: int | None = Field(None, alias="TrainingFileSize") - train_on_inputs: bool | None = None + train_on_inputs: StrictBool | Literal["auto"] | None = "auto" @field_validator("training_type") @classmethod From b1f3a17de245c6b716cf455786a2ce9a1997674d Mon Sep 17 00:00:00 2001 From: Artem Chumachenko Date: Thu, 14 Nov 2024 17:40:29 +0100 Subject: [PATCH 19/21] add system message --- tests/unit/test_files_checks.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/unit/test_files_checks.py b/tests/unit/test_files_checks.py index e79f1986..65f59f61 100644 --- a/tests/unit/test_files_checks.py +++ b/tests/unit/test_files_checks.py @@ -58,6 +58,13 @@ def test_check_jsonl_valid_conversational_single_turn(tmp_path: Path): {"role": "assistant", "content": "I am fine."}, ] }, + { + "messages": [ + {"role": "system", "content": "You are a kind AI"}, + {"role": "user", "content": "How are you?"}, + {"role": "assistant", "content": "I am fine."}, + ] + }, ] with file.open("w") as f: f.write("\n".join(json.dumps(item) for item in content)) @@ -93,6 +100,15 @@ def test_check_jsonl_valid_conversational_multiple_turns(tmp_path: Path): {"role": "assistant", "content": "It is cloudy with a chance of snow."}, ] }, + { + "messages": [ + {"role": "system", "content": "You are a kind AI"}, + {"role": "user", "content": "Who won the game last night?"}, + {"role": "assistant", "content": "The home team won by two points."}, + {"role": "user", "content": "What is the weather like in Amsterdam?"}, + {"role": "assistant", "content": "It is cloudy with a chance of snow."}, + ] + }, ] with file.open("w") as f: f.write("\n".join(json.dumps(item) for item in content)) From 387a23ba94faf23c5adbe7c8d89d31a9790ead71 Mon Sep 17 00:00:00 2001 From: Artem Chumachenko Date: Thu, 14 Nov 2024 17:42:09 +0100 Subject: [PATCH 20/21] version bump --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a29fba8b..4c475deb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api" [tool.poetry] name = "together" -version = "1.3.3" +version = "1.3.4" authors = [ "Together AI " ] From 34d9177eff65987f38d07cf85ae739a569cf804c Mon Sep 17 00:00:00 2001 From: Artem Chumachenko Date: Thu, 14 Nov 2024 18:03:48 +0100 Subject: [PATCH 21/21] Update src/together/cli/api/finetune.py Co-authored-by: Max Ryabinin --- src/together/cli/api/finetune.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/together/cli/api/finetune.py b/src/together/cli/api/finetune.py index bd9c8514..bd509e60 100644 --- a/src/together/cli/api/finetune.py +++ b/src/together/cli/api/finetune.py @@ -165,8 +165,8 @@ def create( "learning_rate": 1e-3, } log_warn_once( - f"LoRA rank default has been changed to {default_values['lora_r']} as the max available for the model.\n" - f"Learning rate default for LoRA FT has been changed to {default_values['learning_rate']}." + f"The default LoRA rank for {model} has been changed to {default_values['lora_r']} as the max available.\n" + f"Also, the default learning rate for LoRA fine-tuning has been changed to {default_values['learning_rate']}." ) for arg in default_values: arg_source = ctx.get_parameter_source("arg") # type: ignore[attr-defined]