From f55bb8cfca0451df88c4964659944a43364399d1 Mon Sep 17 00:00:00 2001
From: Artem Chumachenko <artek.chumak@gmail.com>
Date: Tue, 1 Oct 2024 14:42:35 +0200
Subject: [PATCH 01/21] Add format checks

---
 src/together/constants.py   | 23 +++++++++++
 src/together/utils/files.py | 81 ++++++++++++++++++++++++++++++-------
 2 files changed, 90 insertions(+), 14 deletions(-)

diff --git a/src/together/constants.py b/src/together/constants.py
index b4c9cf3b..3e3399e5 100644
--- a/src/together/constants.py
+++ b/src/together/constants.py
@@ -1,3 +1,5 @@
+import enum
+
 # Session constants
 TIMEOUT_SECS = 600
 MAX_SESSION_LIFETIME_SECS = 180
@@ -29,3 +31,24 @@
 
 # expected columns for Parquet files
 PARQUET_EXPECTED_COLUMNS = ["input_ids", "attention_mask", "labels"]
+
+
+class DatasetFormat(enum.Enum):
+    """Dataset format enum.
+
+    Each member names one JSONL sample layout; the member is used as a key
+    of JSONL_REQUIRED_COLUMNS_MAP to look up that layout's required columns.
+    """
+
+    GENERAL = "general"
+    CONVERSATION = "conversation"
+    INSTRUCTION = "instruction"
+
+
+JSONL_REQUIRED_COLUMNS_MAP = {
+    DatasetFormat.GENERAL: ["text"],
+    DatasetFormat.CONVERSATION: ["messages"],
+    DatasetFormat.INSTRUCTION: ["prompt", "completion"],
+}
+REQUIRED_COLUMNS_CONVERSATION = ["role", "content"]
+POSSIBLE_ROLES_CONVERSATION = ["system", "user", "assistant"]
\ No newline at end of file
diff --git a/src/together/utils/files.py b/src/together/utils/files.py
index 875f4d84..ba342e4f 100644
--- a/src/together/utils/files.py
+++ b/src/together/utils/files.py
@@ -13,6 +13,10 @@
     MIN_SAMPLES,
     NUM_BYTES_IN_GB,
     PARQUET_EXPECTED_COLUMNS,
+    JSONL_REQUIRED_COLUMNS_MAP,
+    REQUIRED_COLUMNS_CONVERSATION,
+    POSSIBLE_ROLES_CONVERSATION,
+    DatasetFormat
 )
 
 
@@ -88,6 +92,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
         report_dict["is_check_passed"] = False
         return report_dict
 
+    dataset_format = None
     with file.open() as f:
         # idx must be instantiated so decode errors (e.g. file is a tar) or empty files are caught
         idx = -1
@@ -104,23 +109,68 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
 
                     report_dict["is_check_passed"] = False
 
-                if "text" not in json_line.keys():
-                    report_dict["text_field"] = False
-                    report_dict["message"] = (
-                        f"Missing 'text' field was found on line {idx + 1} of the the input file. "
-                        "Expected format: {'text': 'my sample string'}. "
-                    )
-                    report_dict["is_check_passed"] = False
-                else:
-                    # check to make sure the value of the "text" key is a string
-                    if not isinstance(json_line["text"], str):
-                        report_dict["key_value"] = False
+                if dataset_format is None:
+                    for possible_format in JSONL_REQUIRED_COLUMNS_MAP:
+                        if all(column in json_line for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format]):
+                            if dataset_format is not None:
+                                report_dict["message"] = (
+                                    "All samples in the dataset must have the same dataset format. "
+                                    f"Got {dataset_format} for the first line and {possible_format} "
+                                    f"for the {idx + 1} line."
+                                )
+                                raise KeyError
+                            dataset_format = possible_format
+                    if dataset_format is None:
                         report_dict["message"] = (
-                            f'Invalid value type for "text" key on line {idx + 1}. '
-                            f'Expected string. Found {type(json_line["text"])}.'
+                            "Error parsing file. Could not detect a possible format for the line with the columns:\n"
+                            f"{json_line.keys()}"
                         )
+                        raise KeyError
 
+                for column in JSONL_REQUIRED_COLUMNS_MAP[dataset_format]:
+                    if column not in json_line.keys():
+                        report_dict["text_field"] = False
+                        report_dict["message"] = (
+                            f"Missing '{column}' field was found on line {idx + 1} of the the input file."
+                        )
                         report_dict["is_check_passed"] = False
+                        raise KeyError
+
+                if dataset_format == DatasetFormat.CONVERSATION:
+                    message_column = JSONL_REQUIRED_COLUMNS_MAP[DatasetFormat.CONVERSATION]
+                    if not isinstance(json_line[message_column], dict):
+                        report_dict["key_value"] = False
+                        report_dict["message"] = (
+                            f"Invalid format on line {idx + 1} of the input file. "
+                            f"Expected dict. Found {type(json_line[message_column])}"
+                        )
+                        raise KeyError
+                    for column in REQUIRED_COLUMNS_CONVERSATION:
+                        if column not in json_line[message_column].keys():
+                            report_dict["key_value"] = False
+                            report_dict["message"] = (
+                                f"Missing '{column}' field was found on line {idx + 1} of the the input file."
+                            )
+                            raise KeyError
+                        else:
+                            if isinstance(json_line[message_column][column], str):
+                                report_dict["text_field"] = False
+                                report_dict["message"] = (
+                                    f"Invalid format on line {idx + 1} in the column {column} of the input file. "
+                                    f"Expected string. found {type(json_line[message_column][column])}"
+                                )
+                    pass
+                else:
+                    # check to make sure the value of the keys is a string
+                    for column in JSONL_REQUIRED_COLUMNS_MAP[dataset_format]:
+                        if not isinstance(json_line[column], str):
+                            report_dict["key_value"] = False
+                            report_dict["message"] = (
+                                f'Invalid value type for "{column}" key on line {idx + 1}. '
+                                f'Expected string. Found {type(json_line[column])}.'
+                            )
+
+                            report_dict["is_check_passed"] = False
 
             # make sure this is outside the for idx, line in enumerate(f): for loop
             if idx + 1 < MIN_SAMPLES:
@@ -136,7 +186,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
 
             report_dict["load_json"] = True
 
-        except ValueError:
+        except ValueError as _:
             report_dict["load_json"] = False
             if idx < 0:
                 report_dict["message"] = (
@@ -148,6 +198,9 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                     f"Error parsing json payload. Unexpected format on line {idx + 1}."
                 )
             report_dict["is_check_passed"] = False
+        except KeyError as _:
+            report_dict["load_json"] = False
+            report_dict["is_check_passed"] = False
 
     if "text_field" not in report_dict:
         report_dict["text_field"] = True

From 8c5106cb8284ccfb18dad851f2f1739faa595e0b Mon Sep 17 00:00:00 2001
From: Artem Chumachenko <artek.chumak@gmail.com>
Date: Thu, 3 Oct 2024 11:43:38 +0200
Subject: [PATCH 02/21] add tests

---
 src/together/utils/files.py     |  87 ++++++++++------
 tests/unit/test_files_checks.py | 172 ++++++++++++++++++++++++++++++++
 2 files changed, 231 insertions(+), 28 deletions(-)
 create mode 100644 tests/unit/test_files_checks.py

diff --git a/src/together/utils/files.py b/src/together/utils/files.py
index ba342e4f..5e7bdeae 100644
--- a/src/together/utils/files.py
+++ b/src/together/utils/files.py
@@ -109,21 +109,32 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
 
                     report_dict["is_check_passed"] = False
 
+                current_format = None
+                for possible_format in JSONL_REQUIRED_COLUMNS_MAP:
+                    if all(column in json_line for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format]):
+                        if current_format is None:
+                            current_format = possible_format
+                        elif current_format != possible_format:
+                            report_dict["message"] = (
+                                "Found multiple dataset formats in the input file. "
+                                f"Got {current_format} and {possible_format} on line {idx + 1}."
+                            )
+                            raise KeyError
+                if current_format is None and dataset_format is None:
+                    report_dict["message"] = (
+                        "Error parsing file. Could not detect a possible format for the line with the columns:\n"
+                        f"{json_line.keys()}"
+                    )
+                    raise KeyError
+
                 if dataset_format is None:
-                    for possible_format in JSONL_REQUIRED_COLUMNS_MAP:
-                        if all(column in json_line for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format]):
-                            if dataset_format is not None:
-                                report_dict["message"] = (
-                                    "All samples in the dataset must have the same dataset format. "
-                                    f"Got {dataset_format} for the first line and {possible_format} "
-                                    f"for the {idx + 1} line."
-                                )
-                                raise KeyError
-                            dataset_format = possible_format
-                    if dataset_format is None:
+                    dataset_format = current_format
+                elif current_format is not None:
+                    if current_format != dataset_format:
                         report_dict["message"] = (
-                            "Error parsing file. Could not detect a possible format for the line with the columns:\n"
-                            f"{json_line.keys()}"
+                            "All samples in the dataset must have the same dataset format. "
+                            f"Got {dataset_format} for the first line and {current_format} "
+                            f"for the {idx + 1} line."
                         )
                         raise KeyError
 
@@ -137,29 +148,49 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                         raise KeyError
 
                 if dataset_format == DatasetFormat.CONVERSATION:
-                    message_column = JSONL_REQUIRED_COLUMNS_MAP[DatasetFormat.CONVERSATION]
-                    if not isinstance(json_line[message_column], dict):
+                    message_column = JSONL_REQUIRED_COLUMNS_MAP[DatasetFormat.CONVERSATION][0]
+                    if not isinstance(json_line[message_column], list):
                         report_dict["key_value"] = False
                         report_dict["message"] = (
                             f"Invalid format on line {idx + 1} of the input file. "
-                            f"Expected dict. Found {type(json_line[message_column])}"
+                            f"Expected list. Found {type(json_line[message_column])}"
                         )
                         raise KeyError
                     for column in REQUIRED_COLUMNS_CONVERSATION:
-                        if column not in json_line[message_column].keys():
+                        for turn in json_line[message_column]:
+                            if column not in turn.keys():
+                                report_dict["key_value"] = False
+                                report_dict["message"] = (
+                                    f"Missing '{column}' in a turn was found on line {idx + 1} of the the input file."
+                                )
+                                raise KeyError
+                            else:
+                                if not isinstance(turn[column], str):
+                                    report_dict["text_field"] = False
+                                    report_dict["message"] = (
+                                        f"Invalid format on line {idx + 1} in the column {column} of the input file. "
+                                        f"Expected string. found {type(turn[column])}"
+                                    )
+                                    raise KeyError
+
+                    roles = set(turn["role"] for turn in json_line["messages"])
+                    for role in roles:
+                        if role not in POSSIBLE_ROLES_CONVERSATION:
                             report_dict["key_value"] = False
                             report_dict["message"] = (
-                                f"Missing '{column}' field was found on line {idx + 1} of the the input file."
+                                f"Found invalid role '{role}' in the messages on the line {idx + 1}. "
+                                f"Possible roles in the conversation are: {POSSIBLE_ROLES_CONVERSATION}"
                             )
                             raise KeyError
-                        else:
-                            if isinstance(json_line[message_column][column], str):
-                                report_dict["text_field"] = False
-                                report_dict["message"] = (
-                                    f"Invalid format on line {idx + 1} in the column {column} of the input file. "
-                                    f"Expected string. found {type(json_line[message_column][column])}"
-                                )
-                    pass
+
+                    is_user_turn = [turn["role"] == "user" for turn in json_line["messages"]]
+                    if any(i == j for i, j in zip(is_user_turn[1:], is_user_turn[:1])):
+                        report_dict["key_value"] = False
+                        report_dict["message"] = (
+                            f"Invalid role turns on line {idx + 1} of the input file. "
+                            "'user' and 'assistant' roles must alternate user/assistant/user/assistant/..."
+                        )
+                        raise KeyError
                 else:
                     # check to make sure the value of the keys is a string
                     for column in JSONL_REQUIRED_COLUMNS_MAP[dataset_format]:
@@ -169,8 +200,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                                 f'Invalid value type for "{column}" key on line {idx + 1}. '
                                 f'Expected string. Found {type(json_line[column])}.'
                             )
-
-                            report_dict["is_check_passed"] = False
+                            raise KeyError
 
             # make sure this is outside the for idx, line in enumerate(f): for loop
             if idx + 1 < MIN_SAMPLES:
@@ -183,6 +213,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
             else:
                 report_dict["num_samples"] = idx + 1
                 report_dict["min_samples"] = True
+                report_dict["is_check_passed"] = True
 
             report_dict["load_json"] = True
 
diff --git a/tests/unit/test_files_checks.py b/tests/unit/test_files_checks.py
new file mode 100644
index 00000000..a8b80131
--- /dev/null
+++ b/tests/unit/test_files_checks.py
@@ -0,0 +1,172 @@
+import pytest
+from pathlib import Path
+from together.utils.files import check_file
+
+
+def test_check_jsonl_valid_general(tmp_path: Path):
+    # Create a valid JSONL file
+    file = tmp_path / "valid.jsonl"
+    content = [
+        '{"text": "Hello, world!"}',
+        '{"text": "How are you?"}'
+    ]
+    file.write_text("\n".join(content), encoding="utf-8")
+    
+    report = check_file(file)
+    
+    assert report["is_check_passed"]
+    assert report["utf8"]
+    assert report["num_samples"] == 2
+    assert report["min_samples"]
+
+
+def test_check_jsonl_valid_instruction(tmp_path: Path):
+    # Create a valid JSONL file with instruction format
+    file = tmp_path / "valid_instruction.jsonl"
+    content = [
+        '{"prompt": "Translate the following sentence.", "completion": "Hello, world!"}',
+        '{"prompt": "Summarize the text.", "completion": "OpenAI creates advanced AI."}'
+    ]
+    file.write_text("\n".join(content), encoding="utf-8")
+    
+    report = check_file(file)
+    
+    assert report["is_check_passed"]
+    assert report["utf8"]
+    assert report["num_samples"] == 2
+    assert report["min_samples"]
+
+
+def test_check_jsonl_valid_conversational_single_turn(tmp_path: Path):
+    # Create a valid JSONL file with conversational format and 1 user-assistant turn pair
+    file = tmp_path / "valid_conversational_single_turn.jsonl"
+    content = [
+        '{"messages": [{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi there!"}]}',
+        '{"messages": [{"role": "user", "content": "How are you?"}, {"role": "assistant", "content": "I am fine."}]}'
+    ]
+    file.write_text("\n".join(content), encoding="utf-8")
+    
+    report = check_file(file)
+    
+    print(report)
+    
+    assert report["is_check_passed"]
+    assert report["utf8"]
+    assert report["num_samples"] == 2
+    assert report["min_samples"]
+
+
+def test_check_jsonl_valid_conversational_multiple_turns(tmp_path: Path):
+    # Create a valid JSONL file with conversational format and multiple user-assistant turn pairs
+    file = tmp_path / "valid_conversational_multiple_turns.jsonl"
+    content = [
+        '{"messages": [{"role": "user", "content": "Is it going to rain today?"}, {"role": "assistant", "content": "Yes, expect showers in the afternoon."}]}',
+        '{"messages": [{"role": "user", "content": "Who won the game last night?"}, {"role": "assistant", "content": "The home team won by two points."}]}'
+    ]
+    file.write_text("\n".join(content), encoding="utf-8")
+    
+    report = check_file(file)
+    
+    assert report["is_check_passed"]
+    assert report["utf8"]
+    assert report["num_samples"] == 2
+    assert report["min_samples"]
+
+
+def test_check_jsonl_empty_file(tmp_path: Path):
+    # Create an empty JSONL file
+    file = tmp_path / "empty.jsonl"
+    file.touch()
+    
+    report = check_file(file)
+    
+    print(report)
+    
+    assert not report["is_check_passed"]
+    assert report["message"] == "File is empty"
+    assert report["file_size"] == 0
+
+
+def test_check_jsonl_non_utf8(tmp_path: Path):
+    # Create a non-UTF-8 encoded JSONL file
+    file = tmp_path / "non_utf8.jsonl"
+    file.write_bytes(b'\xff\xfe\xfd')
+    
+    report = check_file(file)
+    
+    assert not report["is_check_passed"]
+    assert not report["utf8"]
+    assert "File is not UTF-8 encoded." in report["message"]
+
+
+def test_check_jsonl_invalid_json(tmp_path: Path):
+    # Create a JSONL file with invalid JSON
+    file = tmp_path / "invalid_json.jsonl"
+    content = [
+        '{"text": "Hello, world!"}',
+        'Invalid JSON Line'
+    ]
+    file.write_text("\n".join(content), encoding="utf-8")
+    
+    report = check_file(file)
+    
+    assert not report["is_check_passed"]
+    assert "Error parsing json payload" in report["message"]
+
+
+def test_check_jsonl_missing_required_field(tmp_path: Path):
+    # Create a JSONL file missing a required field
+    file = tmp_path / "missing_field.jsonl"
+    content = [
+        '{"prompt": "Translate the following sentence.", "completion": "Hello, world!"}',
+        '{"prompt": "Summarize the text."}'
+    ]
+    file.write_text("\n".join(content), encoding="utf-8")
+    
+    report = check_file(file)
+    
+    assert not report["is_check_passed"]
+    assert "Missing 'completion' field was found on line 2" in report["message"]
+
+
+def test_check_jsonl_inconsistent_dataset_format(tmp_path: Path):
+    # Create a JSONL file with inconsistent dataset formats
+    file = tmp_path / "inconsistent_format.jsonl"
+    content = [
+        '{"messages": [{"role": "user", "content": "Hi"}]}',
+        '{"text": "How are you?"}'  # Missing 'messages'
+    ]
+    file.write_text("\n".join(content), encoding="utf-8")
+    
+    report = check_file(file)
+    
+    assert not report["is_check_passed"]
+    assert "All samples in the dataset must have the same dataset format" in report["message"]
+
+
+def test_check_jsonl_invalid_role(tmp_path: Path):
+    # Create a JSONL file with an invalid role
+    file = tmp_path / "invalid_role.jsonl"
+    content = [
+        '{"messages": [{"role": "invalid_role", "content": "Hi"}]}'
+    ]
+    file.write_text("\n".join(content), encoding="utf-8")
+    
+    report = check_file(file)
+    
+    assert not report["is_check_passed"]
+    assert "Found invalid role 'invalid_role'" in report["message"]
+
+
+def test_check_jsonl_non_alternating_roles(tmp_path: Path):
+    # Create a JSONL file with non-alternating user/assistant roles
+    file = tmp_path / "non_alternating_roles.jsonl"
+    content = [
+        '{"messages": [{"role": "user", "content": "Hi"}, {"role": "user", "content": "Hello again"}]}'
+    ]
+    file.write_text("\n".join(content), encoding="utf-8")
+    
+    report = check_file(file)
+    
+    assert not report["is_check_passed"]
+    assert "Invalid role turns" in report["message"]

From d90b4a545f94a5c592a51b178ed07126c3e13346 Mon Sep 17 00:00:00 2001
From: Artem Chumachenko <artek.chumak@gmail.com>
Date: Mon, 4 Nov 2024 16:32:23 +0100
Subject: [PATCH 03/21] add train on inputs flag

---
 src/together/cli/api/finetune.py   | 22 +++++-----------------
 src/together/cli/api/utils.py      | 21 +++++++++++++++++++++
 src/together/resources/finetune.py |  6 ++++++
 src/together/types/finetune.py     |  2 ++
 4 files changed, 34 insertions(+), 17 deletions(-)

diff --git a/src/together/cli/api/finetune.py b/src/together/cli/api/finetune.py
index 3fdbd74b..334eb290 100644
--- a/src/together/cli/api/finetune.py
+++ b/src/together/cli/api/finetune.py
@@ -11,7 +11,7 @@
 from tabulate import tabulate
 
 from together import Together
-from together.cli.api.utils import INT_WITH_MAX
+from together.cli.api.utils import BOOL_WITH_AUTO, INT_WITH_MAX
 from together.utils import finetune_price_to_dollars, log_warn, parse_timestamp
 from together.types.finetune import DownloadCheckpointType, FinetuneTrainingLimits
 
@@ -93,6 +93,7 @@ def fine_tuning(ctx: click.Context) -> None:
     default=False,
     help="Whether to skip the launch confirmation message",
 )
+@click.option("--train-on-inputs", type=BOOL_WITH_AUTO, default=True, help="Whether to mask the inputs in conversational data")
 def create(
     ctx: click.Context,
     training_file: str,
@@ -112,6 +113,7 @@ def create(
     suffix: str,
     wandb_api_key: str,
     confirm: bool,
+    train_on_inputs: bool | Literal["auto"],
 ) -> None:
     """Start fine-tuning"""
     client: Together = ctx.obj
@@ -133,6 +135,7 @@ def create(
         lora_trainable_modules=lora_trainable_modules,
         suffix=suffix,
         wandb_api_key=wandb_api_key,
+        train_on_inputs=train_on_inputs,
     )
 
     model_limits: FinetuneTrainingLimits = client.fine_tuning.get_model_limits(
@@ -186,22 +189,7 @@ def create(
 
     if confirm or click.confirm(_CONFIRMATION_MESSAGE, default=True, show_default=True):
         response = client.fine_tuning.create(
-            training_file=training_file,
-            model=model,
-            n_epochs=n_epochs,
-            validation_file=validation_file,
-            n_evals=n_evals,
-            n_checkpoints=n_checkpoints,
-            batch_size=batch_size,
-            learning_rate=learning_rate,
-            warmup_ratio=warmup_ratio,
-            lora=lora,
-            lora_r=lora_r,
-            lora_dropout=lora_dropout,
-            lora_alpha=lora_alpha,
-            lora_trainable_modules=lora_trainable_modules,
-            suffix=suffix,
-            wandb_api_key=wandb_api_key,
+            **training_args,
             verbose=True,
         )
 
diff --git a/src/together/cli/api/utils.py b/src/together/cli/api/utils.py
index 3f85f380..e9a7308f 100644
--- a/src/together/cli/api/utils.py
+++ b/src/together/cli/api/utils.py
@@ -27,4 +27,25 @@ def convert(
             )
 
 
+class BooleanWithAutoParamType(click.ParamType):
+    name = "boolean_or_auto"
+
+    def convert(
+            self, value: str, param: click.Parameter | None, ctx: click.Context | None
+    ) -> bool | Literal["auto"] | None:
+        if value == "auto":
+            return "auto"
+        try:  # bool() treats any non-empty string, even "false", as True
+            return bool(click.BOOL.convert(value, param, ctx))
+        except click.BadParameter:
+            self.fail(
+                _("{value!r} is not a valid {boolean_type}.").format(
+                    value=value, boolean_type=self.name
+                ),
+                param,
+                ctx,
+            )
+
+
 INT_WITH_MAX = AutoIntParamType()
+BOOL_WITH_AUTO = BooleanWithAutoParamType()
\ No newline at end of file
diff --git a/src/together/resources/finetune.py b/src/together/resources/finetune.py
index 44d74f2b..e6d3c350 100644
--- a/src/together/resources/finetune.py
+++ b/src/together/resources/finetune.py
@@ -125,6 +125,7 @@ def create(
         wandb_api_key: str | None = None,
         verbose: bool = False,
         model_limits: FinetuneTrainingLimits | None = None,
+        train_on_inputs: bool | Literal["auto"] = "auto",
     ) -> FinetuneResponse:
         """
         Method to initiate a fine-tuning job
@@ -154,6 +155,7 @@ def create(
                 Defaults to False.
             model_limits (FinetuneTrainingLimits, optional): Limits for the hyperparameters the model in Fine-tuning.
                 Defaults to None.
+            train_on_inputs (bool, optional): Whether to mask the inputs in conversational data. Defaults to "auto".
 
         Returns:
             FinetuneResponse: Object containing information about fine-tuning job.
@@ -184,6 +186,7 @@ def create(
             lora_trainable_modules=lora_trainable_modules,
             suffix=suffix,
             wandb_api_key=wandb_api_key,
+            train_on_inputs=train_on_inputs,
         )
 
         if verbose:
@@ -436,6 +439,7 @@ async def create(
         wandb_api_key: str | None = None,
         verbose: bool = False,
         model_limits: FinetuneTrainingLimits | None = None,
+        train_on_inputs: bool | Literal["auto"] = "auto"
     ) -> FinetuneResponse:
         """
         Async method to initiate a fine-tuning job
@@ -465,6 +469,7 @@ async def create(
                 Defaults to False.
             model_limits (FinetuneTrainingLimits, optional): Limits for the hyperparameters the model in Fine-tuning.
                 Defaults to None.
+            train_on_inputs (bool, optional): Whether to mask the inputs in conversational data. Defaults to "auto".
 
         Returns:
             FinetuneResponse: Object containing information about fine-tuning job.
@@ -495,6 +500,7 @@ async def create(
             lora_trainable_modules=lora_trainable_modules,
             suffix=suffix,
             wandb_api_key=wandb_api_key,
+            train_on_inputs=train_on_inputs,
         )
 
         if verbose:
diff --git a/src/together/types/finetune.py b/src/together/types/finetune.py
index 2f76c446..917a5143 100644
--- a/src/together/types/finetune.py
+++ b/src/together/types/finetune.py
@@ -163,6 +163,7 @@ class FinetuneRequest(BaseModel):
     # weights & biases api key
     wandb_key: str | None = None
     training_type: FullTrainingType | LoRATrainingType | None = None
+    train_on_inputs: bool | Literal["auto"] = "auto"
 
 
 class FinetuneResponse(BaseModel):
@@ -230,6 +231,7 @@ class FinetuneResponse(BaseModel):
     # training file metadata
     training_file_num_lines: int | None = Field(None, alias="TrainingFileNumLines")
     training_file_size: int | None = Field(None, alias="TrainingFileSize")
+    train_on_inputs: bool | Literal["auto"] = "auto"
 
     @field_validator("training_type")
     @classmethod

From 8be70ac066c796ff5bc76468ec21e7754d5eb367 Mon Sep 17 00:00:00 2001
From: Artem Chumachenko <artek.chumak@gmail.com>
Date: Mon, 4 Nov 2024 16:49:52 +0100
Subject: [PATCH 04/21] style

---
 src/together/cli/api/finetune.py   |  7 ++-
 src/together/cli/api/utils.py      |  4 +-
 src/together/constants.py          |  2 +-
 src/together/resources/finetune.py |  4 +-
 src/together/utils/files.py        | 17 +++++--
 tests/unit/test_files_checks.py    | 79 ++++++++++++++----------------
 6 files changed, 61 insertions(+), 52 deletions(-)

diff --git a/src/together/cli/api/finetune.py b/src/together/cli/api/finetune.py
index 334eb290..fec29a8a 100644
--- a/src/together/cli/api/finetune.py
+++ b/src/together/cli/api/finetune.py
@@ -93,7 +93,12 @@ def fine_tuning(ctx: click.Context) -> None:
     default=False,
     help="Whether to skip the launch confirmation message",
 )
-@click.option("--train-on-inputs", type=BOOL_WITH_AUTO, default=True, help="Whether to mask the inputs in conversational data")
+@click.option(
+    "--train-on-inputs",
+    type=BOOL_WITH_AUTO,
+    default=True,
+    help="Whether to mask the inputs in conversational data",
+)
 def create(
     ctx: click.Context,
     training_file: str,
diff --git a/src/together/cli/api/utils.py b/src/together/cli/api/utils.py
index e9a7308f..7f80a68b 100644
--- a/src/together/cli/api/utils.py
+++ b/src/together/cli/api/utils.py
@@ -31,7 +31,7 @@ class BooleanWithAutoParamType(click.ParamType):
     name = "boolean_or_auto"
 
     def convert(
-            self, value: str, param: click.Parameter | None, ctx: click.Context | None
+        self, value: str, param: click.Parameter | None, ctx: click.Context | None
     ) -> bool | Literal["auto"] | None:
         if value == "auto":
             return "auto"
@@ -48,4 +48,4 @@ def convert(
 
 
 INT_WITH_MAX = AutoIntParamType()
-BOOL_WITH_AUTO = BooleanWithAutoParamType()
\ No newline at end of file
+BOOL_WITH_AUTO = BooleanWithAutoParamType()
diff --git a/src/together/constants.py b/src/together/constants.py
index 3e3399e5..3d34083d 100644
--- a/src/together/constants.py
+++ b/src/together/constants.py
@@ -51,4 +51,4 @@ class DatasetFormat(enum.Enum):
     DatasetFormat.INSTRUCTION: ["prompt", "completion"],
 }
 REQUIRED_COLUMNS_CONVERSATION = ["role", "content"]
-POSSIBLE_ROLES_CONVERSATION = ["system", "user", "assistant"]
\ No newline at end of file
+POSSIBLE_ROLES_CONVERSATION = ["system", "user", "assistant"]
diff --git a/src/together/resources/finetune.py b/src/together/resources/finetune.py
index e6d3c350..40de2a1d 100644
--- a/src/together/resources/finetune.py
+++ b/src/together/resources/finetune.py
@@ -43,6 +43,7 @@ def createFinetuneRequest(
     lora_trainable_modules: str | None = "all-linear",
     suffix: str | None = None,
     wandb_api_key: str | None = None,
+    train_on_inputs: bool | Literal["auto"] = "auto",
 ) -> FinetuneRequest:
     if batch_size == "max":
         log_warn_once(
@@ -95,6 +96,7 @@ def createFinetuneRequest(
         training_type=training_type,
         suffix=suffix,
         wandb_key=wandb_api_key,
+        train_on_inputs=train_on_inputs,
     )
 
     return finetune_request
@@ -439,7 +441,7 @@ async def create(
         wandb_api_key: str | None = None,
         verbose: bool = False,
         model_limits: FinetuneTrainingLimits | None = None,
-        train_on_inputs: bool | Literal["auto"] = "auto"
+        train_on_inputs: bool | Literal["auto"] = "auto",
     ) -> FinetuneResponse:
         """
         Async method to initiate a fine-tuning job
diff --git a/src/together/utils/files.py b/src/together/utils/files.py
index 5e7bdeae..fd0c99c4 100644
--- a/src/together/utils/files.py
+++ b/src/together/utils/files.py
@@ -16,7 +16,7 @@
     JSONL_REQUIRED_COLUMNS_MAP,
     REQUIRED_COLUMNS_CONVERSATION,
     POSSIBLE_ROLES_CONVERSATION,
-    DatasetFormat
+    DatasetFormat,
 )
 
 
@@ -111,7 +111,10 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
 
                 current_format = None
                 for possible_format in JSONL_REQUIRED_COLUMNS_MAP:
-                    if all(column in json_line for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format]):
+                    if all(
+                        column in json_line
+                        for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format]
+                    ):
                         if current_format is None:
                             current_format = possible_format
                         elif current_format != possible_format:
@@ -148,7 +151,9 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                         raise KeyError
 
                 if dataset_format == DatasetFormat.CONVERSATION:
-                    message_column = JSONL_REQUIRED_COLUMNS_MAP[DatasetFormat.CONVERSATION][0]
+                    message_column = JSONL_REQUIRED_COLUMNS_MAP[
+                        DatasetFormat.CONVERSATION
+                    ][0]
                     if not isinstance(json_line[message_column], list):
                         report_dict["key_value"] = False
                         report_dict["message"] = (
@@ -183,7 +188,9 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                             )
                             raise KeyError
 
-                    is_user_turn = [turn["role"] == "user" for turn in json_line["messages"]]
+                    is_user_turn = [
+                        turn["role"] == "user" for turn in json_line["messages"]
+                    ]
                     if any(i == j for i, j in zip(is_user_turn[1:], is_user_turn[:1])):
                         report_dict["key_value"] = False
                         report_dict["message"] = (
@@ -198,7 +205,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                             report_dict["key_value"] = False
                             report_dict["message"] = (
                                 f'Invalid value type for "{column}" key on line {idx + 1}. '
-                                f'Expected string. Found {type(json_line[column])}.'
+                                f"Expected string. Found {type(json_line[column])}."
                             )
                             raise KeyError
 
diff --git a/tests/unit/test_files_checks.py b/tests/unit/test_files_checks.py
index a8b80131..2662c6b1 100644
--- a/tests/unit/test_files_checks.py
+++ b/tests/unit/test_files_checks.py
@@ -6,14 +6,11 @@
 def test_check_jsonl_valid_general(tmp_path: Path):
     # Create a valid JSONL file
     file = tmp_path / "valid.jsonl"
-    content = [
-        '{"text": "Hello, world!"}',
-        '{"text": "How are you?"}'
-    ]
+    content = ['{"text": "Hello, world!"}', '{"text": "How are you?"}']
     file.write_text("\n".join(content), encoding="utf-8")
-    
+
     report = check_file(file)
-    
+
     assert report["is_check_passed"]
     assert report["utf8"]
     assert report["num_samples"] == 2
@@ -25,12 +22,12 @@ def test_check_jsonl_valid_instruction(tmp_path: Path):
     file = tmp_path / "valid_instruction.jsonl"
     content = [
         '{"prompt": "Translate the following sentence.", "completion": "Hello, world!"}',
-        '{"prompt": "Summarize the text.", "completion": "OpenAI creates advanced AI."}'
+        '{"prompt": "Summarize the text.", "completion": "OpenAI creates advanced AI."}',
     ]
     file.write_text("\n".join(content), encoding="utf-8")
-    
+
     report = check_file(file)
-    
+
     assert report["is_check_passed"]
     assert report["utf8"]
     assert report["num_samples"] == 2
@@ -42,14 +39,14 @@ def test_check_jsonl_valid_conversational_single_turn(tmp_path: Path):
     file = tmp_path / "valid_conversational_single_turn.jsonl"
     content = [
         '{"messages": [{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi there!"}]}',
-        '{"messages": [{"role": "user", "content": "How are you?"}, {"role": "assistant", "content": "I am fine."}]}'
+        '{"messages": [{"role": "user", "content": "How are you?"}, {"role": "assistant", "content": "I am fine."}]}',
     ]
     file.write_text("\n".join(content), encoding="utf-8")
-    
+
     report = check_file(file)
-    
+
     print(report)
-    
+
     assert report["is_check_passed"]
     assert report["utf8"]
     assert report["num_samples"] == 2
@@ -61,12 +58,12 @@ def test_check_jsonl_valid_conversational_multiple_turns(tmp_path: Path):
     file = tmp_path / "valid_conversational_multiple_turns.jsonl"
     content = [
         '{"messages": [{"role": "user", "content": "Is it going to rain today?"}, {"role": "assistant", "content": "Yes, expect showers in the afternoon."}]}',
-        '{"messages": [{"role": "user", "content": "Who won the game last night?"}, {"role": "assistant", "content": "The home team won by two points."}]}'
+        '{"messages": [{"role": "user", "content": "Who won the game last night?"}, {"role": "assistant", "content": "The home team won by two points."}]}',
     ]
     file.write_text("\n".join(content), encoding="utf-8")
-    
+
     report = check_file(file)
-    
+
     assert report["is_check_passed"]
     assert report["utf8"]
     assert report["num_samples"] == 2
@@ -77,11 +74,11 @@ def test_check_jsonl_empty_file(tmp_path: Path):
     # Create an empty JSONL file
     file = tmp_path / "empty.jsonl"
     file.touch()
-    
+
     report = check_file(file)
-    
+
     print(report)
-    
+
     assert not report["is_check_passed"]
     assert report["message"] == "File is empty"
     assert report["file_size"] == 0
@@ -90,10 +87,10 @@ def test_check_jsonl_empty_file(tmp_path: Path):
 def test_check_jsonl_non_utf8(tmp_path: Path):
     # Create a non-UTF-8 encoded JSONL file
     file = tmp_path / "non_utf8.jsonl"
-    file.write_bytes(b'\xff\xfe\xfd')
-    
+    file.write_bytes(b"\xff\xfe\xfd")
+
     report = check_file(file)
-    
+
     assert not report["is_check_passed"]
     assert not report["utf8"]
     assert "File is not UTF-8 encoded." in report["message"]
@@ -102,14 +99,11 @@ def test_check_jsonl_non_utf8(tmp_path: Path):
 def test_check_jsonl_invalid_json(tmp_path: Path):
     # Create a JSONL file with invalid JSON
     file = tmp_path / "invalid_json.jsonl"
-    content = [
-        '{"text": "Hello, world!"}',
-        'Invalid JSON Line'
-    ]
+    content = ['{"text": "Hello, world!"}', "Invalid JSON Line"]
     file.write_text("\n".join(content), encoding="utf-8")
-    
+
     report = check_file(file)
-    
+
     assert not report["is_check_passed"]
     assert "Error parsing json payload" in report["message"]
 
@@ -119,12 +113,12 @@ def test_check_jsonl_missing_required_field(tmp_path: Path):
     file = tmp_path / "missing_field.jsonl"
     content = [
         '{"prompt": "Translate the following sentence.", "completion": "Hello, world!"}',
-        '{"prompt": "Summarize the text."}'
+        '{"prompt": "Summarize the text."}',
     ]
     file.write_text("\n".join(content), encoding="utf-8")
-    
+
     report = check_file(file)
-    
+
     assert not report["is_check_passed"]
     assert "Missing 'completion' field was found on line 2" in report["message"]
 
@@ -134,26 +128,27 @@ def test_check_jsonl_inconsistent_dataset_format(tmp_path: Path):
     file = tmp_path / "inconsistent_format.jsonl"
     content = [
         '{"messages": [{"role": "user", "content": "Hi"}]}',
-        '{"text": "How are you?"}'  # Missing 'messages'
+        '{"text": "How are you?"}',  # Missing 'messages'
     ]
     file.write_text("\n".join(content), encoding="utf-8")
-    
+
     report = check_file(file)
-    
+
     assert not report["is_check_passed"]
-    assert "All samples in the dataset must have the same dataset format" in report["message"]
+    assert (
+        "All samples in the dataset must have the same dataset format"
+        in report["message"]
+    )
 
 
 def test_check_jsonl_invalid_role(tmp_path: Path):
     # Create a JSONL file with an invalid role
     file = tmp_path / "invalid_role.jsonl"
-    content = [
-        '{"messages": [{"role": "invalid_role", "content": "Hi"}]}'
-    ]
+    content = ['{"messages": [{"role": "invalid_role", "content": "Hi"}]}']
     file.write_text("\n".join(content), encoding="utf-8")
-    
+
     report = check_file(file)
-    
+
     assert not report["is_check_passed"]
     assert "Found invalid role 'invalid_role'" in report["message"]
 
@@ -165,8 +160,8 @@ def test_check_jsonl_non_alternating_roles(tmp_path: Path):
         '{"messages": [{"role": "user", "content": "Hi"}, {"role": "user", "content": "Hello again"}]}'
     ]
     file.write_text("\n".join(content), encoding="utf-8")
-    
+
     report = check_file(file)
-    
+
     assert not report["is_check_passed"]
     assert "Invalid role turns" in report["message"]

From a5d666a14cf8a2d08a88ef12b1eb62fe5c8db4e4 Mon Sep 17 00:00:00 2001
From: Artem Chumachenko <artek.chumak@gmail.com>
Date: Thu, 7 Nov 2024 14:00:03 +0100
Subject: [PATCH 05/21] PR feedback

---
 src/together/cli/api/finetune.py |   4 +-
 src/together/cli/api/utils.py    |   4 +-
 src/together/constants.py        |   8 +--
 src/together/utils/files.py      | 107 ++++++++++++++++---------------
 tests/unit/test_files_checks.py  |  80 +++++++++++++----------
 5 files changed, 106 insertions(+), 97 deletions(-)

diff --git a/src/together/cli/api/finetune.py b/src/together/cli/api/finetune.py
index fec29a8a..4d4896b2 100644
--- a/src/together/cli/api/finetune.py
+++ b/src/together/cli/api/finetune.py
@@ -96,8 +96,8 @@ def fine_tuning(ctx: click.Context) -> None:
 @click.option(
     "--train-on-inputs",
     type=BOOL_WITH_AUTO,
-    default=True,
-    help="Whether to mask the inputs in conversational data",
+    default="auto",
+    help="Whether to mask the user messages in conversational data or prompts in instruction data",
 )
 def create(
     ctx: click.Context,
diff --git a/src/together/cli/api/utils.py b/src/together/cli/api/utils.py
index 7f80a68b..08dfe492 100644
--- a/src/together/cli/api/utils.py
+++ b/src/together/cli/api/utils.py
@@ -39,8 +39,8 @@ def convert(
             return bool(value)
         except ValueError:
             self.fail(
-                _("{value!r} is not a valid {boolean_type}.").format(
-                    value=value, boolean_type=self.name
+                _("{value!r} is not a valid {type}.").format(
+                    value=value, type=self.name
                 ),
                 param,
                 ctx,
diff --git a/src/together/constants.py b/src/together/constants.py
index 3d34083d..c64af326 100644
--- a/src/together/constants.py
+++ b/src/together/constants.py
@@ -34,11 +34,7 @@
 
 
 class DatasetFormat(enum.Enum):
-    """Dataset format enum.
-
-    Args:
-        enum (enum.Enum): Enum class for dataset format.
-    """
+    """Dataset format enum."""
 
     GENERAL = "general"
     CONVERSATION = "conversation"
@@ -50,5 +46,5 @@ class DatasetFormat(enum.Enum):
     DatasetFormat.CONVERSATION: ["messages"],
     DatasetFormat.INSTRUCTION: ["prompt", "completion"],
 }
-REQUIRED_COLUMNS_CONVERSATION = ["role", "content"]
+REQUIRED_COLUMNS_MESSAGE = ["role", "content"]
 POSSIBLE_ROLES_CONVERSATION = ["system", "user", "assistant"]
diff --git a/src/together/utils/files.py b/src/together/utils/files.py
index fd0c99c4..a3537ba8 100644
--- a/src/together/utils/files.py
+++ b/src/together/utils/files.py
@@ -14,12 +14,19 @@
     NUM_BYTES_IN_GB,
     PARQUET_EXPECTED_COLUMNS,
     JSONL_REQUIRED_COLUMNS_MAP,
-    REQUIRED_COLUMNS_CONVERSATION,
+    REQUIRED_COLUMNS_MESSAGE,
     POSSIBLE_ROLES_CONVERSATION,
     DatasetFormat,
 )
 
 
+class InvalidFileFormatError(Exception):
+    """Exception raised for invalid file formats during file checks."""
+
+    def __init__(self) -> None:
+        super().__init__()
+
+
 def check_file(
     file: Path | str,
 ) -> Dict[str, Any]:
@@ -108,9 +115,11 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                     )
 
                     report_dict["is_check_passed"] = False
+                    raise InvalidFileFormatError
 
                 current_format = None
                 for possible_format in JSONL_REQUIRED_COLUMNS_MAP:
+                    # Check if every column in the data contains in
                     if all(
                         column in json_line
                         for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format]
@@ -122,35 +131,16 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                                 "Found multiple dataset formats in the input file. "
                                 f"Got {current_format} and {possible_format} on line {idx + 1}."
                             )
-                            raise KeyError
-                if current_format is None and dataset_format is None:
+                            raise InvalidFileFormatError
+
+                if current_format is None:
                     report_dict["message"] = (
-                        "Error parsing file. Could not detect a possible format for the line with the columns:\n"
+                        f"Error parsing file. Could not detect a format for the line {idx + 1} with the columns:\n"
                         f"{json_line.keys()}"
                     )
-                    raise KeyError
+                    raise InvalidFileFormatError
 
-                if dataset_format is None:
-                    dataset_format = current_format
-                elif current_format is not None:
-                    if current_format != dataset_format:
-                        report_dict["message"] = (
-                            "All samples in the dataset must have the same dataset format. "
-                            f"Got {dataset_format} for the first line and {current_format} "
-                            f"for the {idx + 1} line."
-                        )
-                        raise KeyError
-
-                for column in JSONL_REQUIRED_COLUMNS_MAP[dataset_format]:
-                    if column not in json_line.keys():
-                        report_dict["text_field"] = False
-                        report_dict["message"] = (
-                            f"Missing '{column}' field was found on line {idx + 1} of the the input file."
-                        )
-                        report_dict["is_check_passed"] = False
-                        raise KeyError
-
-                if dataset_format == DatasetFormat.CONVERSATION:
+                if current_format == DatasetFormat.CONVERSATION:
                     message_column = JSONL_REQUIRED_COLUMNS_MAP[
                         DatasetFormat.CONVERSATION
                     ][0]
@@ -158,56 +148,69 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                         report_dict["key_value"] = False
                         report_dict["message"] = (
                             f"Invalid format on line {idx + 1} of the input file. "
-                            f"Expected list. Found {type(json_line[message_column])}"
+                            f"Expected a list of messages. Found {type(json_line[message_column])}"
                         )
-                        raise KeyError
-                    for column in REQUIRED_COLUMNS_CONVERSATION:
-                        for turn in json_line[message_column]:
-                            if column not in turn.keys():
+                        raise InvalidFileFormatError
+
+                    previous_role = ""
+                    for turn in json_line[message_column]:
+                        for column in REQUIRED_COLUMNS_MESSAGE:
+                            if column not in turn:
                                 report_dict["key_value"] = False
                                 report_dict["message"] = (
-                                    f"Missing '{column}' in a turn was found on line {idx + 1} of the the input file."
+                                    f"Field '{column}' is missing for a turn `{turn}` on line {idx + 1} "
+                                    "of the the input file."
                                 )
-                                raise KeyError
+                                raise InvalidFileFormatError
                             else:
                                 if not isinstance(turn[column], str):
                                     report_dict["text_field"] = False
                                     report_dict["message"] = (
-                                        f"Invalid format on line {idx + 1} in the column {column} of the input file. "
-                                        f"Expected string. found {type(turn[column])}"
+                                        f"Invalid format on line {idx + 1} in the column {column} for turn `{turn}` "
+                                        f"of the input file. Expected string. Found {type(turn[column])}"
                                     )
-                                    raise KeyError
+                                    raise InvalidFileFormatError
+                        role = turn["role"]
 
-                    roles = set(turn["role"] for turn in json_line["messages"])
-                    for role in roles:
                         if role not in POSSIBLE_ROLES_CONVERSATION:
                             report_dict["key_value"] = False
                             report_dict["message"] = (
                                 f"Found invalid role '{role}' in the messages on the line {idx + 1}. "
                                 f"Possible roles in the conversation are: {POSSIBLE_ROLES_CONVERSATION}"
                             )
-                            raise KeyError
+                            raise InvalidFileFormatError
+
+                        if previous_role == role:
+                            report_dict["key_value"] = False
+                            report_dict["message"] = (
+                                f"Invalid role turns on line {idx + 1} of the input file. "
+                                "'user' and 'assistant' roles must alternate user/assistant/user/assistant/..."
+                            )
+                            raise InvalidFileFormatError
+
+                        previous_role = role
 
-                    is_user_turn = [
-                        turn["role"] == "user" for turn in json_line["messages"]
-                    ]
-                    if any(i == j for i, j in zip(is_user_turn[1:], is_user_turn[:1])):
-                        report_dict["key_value"] = False
-                        report_dict["message"] = (
-                            f"Invalid role turns on line {idx + 1} of the input file. "
-                            "'user' and 'assistant' roles must alternate user/assistant/user/assistant/..."
-                        )
-                        raise KeyError
                 else:
                     # check to make sure the value of the keys is a string
-                    for column in JSONL_REQUIRED_COLUMNS_MAP[dataset_format]:
+                    for column in JSONL_REQUIRED_COLUMNS_MAP[current_format]:
                         if not isinstance(json_line[column], str):
                             report_dict["key_value"] = False
                             report_dict["message"] = (
                                 f'Invalid value type for "{column}" key on line {idx + 1}. '
                                 f"Expected string. Found {type(json_line[column])}."
                             )
-                            raise KeyError
+                            raise InvalidFileFormatError
+
+                if dataset_format is None:
+                    dataset_format = current_format
+                elif current_format is not None:
+                    if current_format != dataset_format:
+                        report_dict["message"] = (
+                            "All samples in the dataset must have the same dataset format. "
+                            f"Got {dataset_format} for the first line and {current_format} "
+                            f"for the line {idx + 1}."
+                        )
+                        raise InvalidFileFormatError
 
             # make sure this is outside the for idx, line in enumerate(f): for loop
             if idx + 1 < MIN_SAMPLES:
@@ -236,7 +239,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                     f"Error parsing json payload. Unexpected format on line {idx + 1}."
                 )
             report_dict["is_check_passed"] = False
-        except KeyError as _:
+        except InvalidFileFormatError:
             report_dict["load_json"] = False
             report_dict["is_check_passed"] = False
 
diff --git a/tests/unit/test_files_checks.py b/tests/unit/test_files_checks.py
index 2662c6b1..f83533bc 100644
--- a/tests/unit/test_files_checks.py
+++ b/tests/unit/test_files_checks.py
@@ -1,73 +1,79 @@
+import json
 import pytest
 from pathlib import Path
+
+from together.constants import MIN_SAMPLES
 from together.utils.files import check_file
 
 
 def test_check_jsonl_valid_general(tmp_path: Path):
     # Create a valid JSONL file
     file = tmp_path / "valid.jsonl"
-    content = ['{"text": "Hello, world!"}', '{"text": "How are you?"}']
-    file.write_text("\n".join(content), encoding="utf-8")
+    content = [{"text": "Hello, world!"}, {"text": "How are you?"}]
+    with file.open("w") as f:
+        f.write("\n".join([json.dumps(item) for item in content]))
 
     report = check_file(file)
 
     assert report["is_check_passed"]
     assert report["utf8"]
-    assert report["num_samples"] == 2
-    assert report["min_samples"]
+    assert report["num_samples"] == len(content)
+    assert report["min_samples"] >= MIN_SAMPLES
 
 
 def test_check_jsonl_valid_instruction(tmp_path: Path):
     # Create a valid JSONL file with instruction format
     file = tmp_path / "valid_instruction.jsonl"
     content = [
-        '{"prompt": "Translate the following sentence.", "completion": "Hello, world!"}',
-        '{"prompt": "Summarize the text.", "completion": "OpenAI creates advanced AI."}',
+        {"prompt": "Translate the following sentence.", "completion": "Hello, world!"},
+        {"prompt": "Summarize the text.", "completion": "OpenAI creates advanced AI."},
     ]
-    file.write_text("\n".join(content), encoding="utf-8")
+    with file.open("w") as f:
+        f.write("\n".join([json.dumps(item) for item in content]))
 
     report = check_file(file)
 
     assert report["is_check_passed"]
     assert report["utf8"]
-    assert report["num_samples"] == 2
-    assert report["min_samples"]
+    assert report["num_samples"] == len(content)
+    assert report["min_samples"] >= MIN_SAMPLES
 
 
 def test_check_jsonl_valid_conversational_single_turn(tmp_path: Path):
     # Create a valid JSONL file with conversational format and 1 user-assistant turn pair
     file = tmp_path / "valid_conversational_single_turn.jsonl"
     content = [
-        '{"messages": [{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi there!"}]}',
-        '{"messages": [{"role": "user", "content": "How are you?"}, {"role": "assistant", "content": "I am fine."}]}',
+        {"messages": [{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi there!"}]},
+        {"messages": [{"role": "user", "content": "How are you?"}, {"role": "assistant", "content": "I am fine."}]},
     ]
-    file.write_text("\n".join(content), encoding="utf-8")
+    with file.open("w") as f:
+        f.write("\n".join([json.dumps(item) for item in content]))
 
     report = check_file(file)
 
-    print(report)
 
     assert report["is_check_passed"]
     assert report["utf8"]
-    assert report["num_samples"] == 2
-    assert report["min_samples"]
+    assert report["num_samples"] == len(content)
+    assert report["min_samples"] >= MIN_SAMPLES
 
 
 def test_check_jsonl_valid_conversational_multiple_turns(tmp_path: Path):
     # Create a valid JSONL file with conversational format and multiple user-assistant turn pairs
     file = tmp_path / "valid_conversational_multiple_turns.jsonl"
     content = [
-        '{"messages": [{"role": "user", "content": "Is it going to rain today?"}, {"role": "assistant", "content": "Yes, expect showers in the afternoon."}]}',
-        '{"messages": [{"role": "user", "content": "Who won the game last night?"}, {"role": "assistant", "content": "The home team won by two points."}]}',
+        {"messages": [{"role": "user", "content": "Is it going to rain today?"}, {"role": "assistant", "content": "Yes, expect showers in the afternoon."}, {"role": "user", "content": "What is the weather like in Tokyo?"}, {"role": "assistant", "content": "It is sunny with a chance of rain."}]},
+        {"messages": [{"role": "user", "content": "Who won the game last night?"}, {"role": "assistant", "content": "The home team won by two points."}, {"role": "user", "content": "What is the weather like in Amsterdam?"}, {"role": "assistant", "content": "It is cloudy with a chance of snow."}]},
     ]
-    file.write_text("\n".join(content), encoding="utf-8")
+    with file.open("w") as f:
+        f.write("\n".join([json.dumps(item) for item in content]))
 
     report = check_file(file)
 
     assert report["is_check_passed"]
     assert report["utf8"]
-    assert report["num_samples"] == 2
-    assert report["min_samples"]
+    assert report["num_samples"] == len(content)
+    assert report["min_samples"] >= MIN_SAMPLES
 
 
 def test_check_jsonl_empty_file(tmp_path: Path):
@@ -77,7 +83,6 @@ def test_check_jsonl_empty_file(tmp_path: Path):
 
     report = check_file(file)
 
-    print(report)
 
     assert not report["is_check_passed"]
     assert report["message"] == "File is empty"
@@ -99,38 +104,41 @@ def test_check_jsonl_non_utf8(tmp_path: Path):
 def test_check_jsonl_invalid_json(tmp_path: Path):
     # Create a JSONL file with invalid JSON
     file = tmp_path / "invalid_json.jsonl"
-    content = ['{"text": "Hello, world!"}', "Invalid JSON Line"]
-    file.write_text("\n".join(content), encoding="utf-8")
+    content = [{"text": "Hello, world!"}, "Invalid JSON Line"]
+    with file.open("w") as f:
+        f.write("\n".join([json.dumps(item) for item in content]))
 
     report = check_file(file)
 
     assert not report["is_check_passed"]
-    assert "Error parsing json payload" in report["message"]
+    assert "Error parsing file." in report["message"]
 
 
 def test_check_jsonl_missing_required_field(tmp_path: Path):
     # Create a JSONL file missing a required field
     file = tmp_path / "missing_field.jsonl"
     content = [
-        '{"prompt": "Translate the following sentence.", "completion": "Hello, world!"}',
-        '{"prompt": "Summarize the text."}',
+        {"prompt": "Translate the following sentence.", "completion": "Hello, world!"},
+        {"prompt": "Summarize the text."},
     ]
-    file.write_text("\n".join(content), encoding="utf-8")
+    with file.open("w") as f:
+        f.write("\n".join([json.dumps(item) for item in content]))
 
     report = check_file(file)
 
     assert not report["is_check_passed"]
-    assert "Missing 'completion' field was found on line 2" in report["message"]
+    assert "Error parsing file. Could not detect a format for the line 2" in report["message"]
 
 
 def test_check_jsonl_inconsistent_dataset_format(tmp_path: Path):
     # Create a JSONL file with inconsistent dataset formats
     file = tmp_path / "inconsistent_format.jsonl"
     content = [
-        '{"messages": [{"role": "user", "content": "Hi"}]}',
-        '{"text": "How are you?"}',  # Missing 'messages'
+        {"messages": [{"role": "user", "content": "Hi"}]},
+        {"text": "How are you?"},  # Missing 'messages'
     ]
-    file.write_text("\n".join(content), encoding="utf-8")
+    with file.open("w") as f:
+        f.write("\n".join([json.dumps(item) for item in content]))
 
     report = check_file(file)
 
@@ -144,8 +152,9 @@ def test_check_jsonl_inconsistent_dataset_format(tmp_path: Path):
 def test_check_jsonl_invalid_role(tmp_path: Path):
     # Create a JSONL file with an invalid role
     file = tmp_path / "invalid_role.jsonl"
-    content = ['{"messages": [{"role": "invalid_role", "content": "Hi"}]}']
-    file.write_text("\n".join(content), encoding="utf-8")
+    content = [{"messages": [{"role": "invalid_role", "content": "Hi"}]}]
+    with file.open("w") as f:
+        f.write("\n".join([json.dumps(item) for item in content]))
 
     report = check_file(file)
 
@@ -157,9 +166,10 @@ def test_check_jsonl_non_alternating_roles(tmp_path: Path):
     # Create a JSONL file with non-alternating user/assistant roles
     file = tmp_path / "non_alternating_roles.jsonl"
     content = [
-        '{"messages": [{"role": "user", "content": "Hi"}, {"role": "user", "content": "Hello again"}]}'
+        {"messages": [{"role": "user", "content": "Hi"}, {"role": "user", "content": "Hello again"}]}
     ]
-    file.write_text("\n".join(content), encoding="utf-8")
+    with file.open("w") as f:
+        f.write("\n".join([json.dumps(item) for item in content]))
 
     report = check_file(file)
 

From ff47c0266edad7a634e4d9d02e9458ae359b4327 Mon Sep 17 00:00:00 2001
From: Artem Chumachenko <artek.chumak@gmail.com>
Date: Thu, 7 Nov 2024 14:03:21 +0100
Subject: [PATCH 06/21] style

---
 tests/unit/test_files_checks.py | 49 +++++++++++++++++++++++++++------
 1 file changed, 41 insertions(+), 8 deletions(-)

diff --git a/tests/unit/test_files_checks.py b/tests/unit/test_files_checks.py
index f83533bc..6b497292 100644
--- a/tests/unit/test_files_checks.py
+++ b/tests/unit/test_files_checks.py
@@ -43,15 +43,24 @@ def test_check_jsonl_valid_conversational_single_turn(tmp_path: Path):
     # Create a valid JSONL file with conversational format and 1 user-assistant turn pair
     file = tmp_path / "valid_conversational_single_turn.jsonl"
     content = [
-        {"messages": [{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi there!"}]},
-        {"messages": [{"role": "user", "content": "How are you?"}, {"role": "assistant", "content": "I am fine."}]},
+        {
+            "messages": [
+                {"role": "user", "content": "Hello"},
+                {"role": "assistant", "content": "Hi there!"},
+            ]
+        },
+        {
+            "messages": [
+                {"role": "user", "content": "How are you?"},
+                {"role": "assistant", "content": "I am fine."},
+            ]
+        },
     ]
     with file.open("w") as f:
         f.write("\n".join([json.dumps(item) for item in content]))
 
     report = check_file(file)
 
-
     assert report["is_check_passed"]
     assert report["utf8"]
     assert report["num_samples"] == len(content)
@@ -62,8 +71,25 @@ def test_check_jsonl_valid_conversational_multiple_turns(tmp_path: Path):
     # Create a valid JSONL file with conversational format and multiple user-assistant turn pairs
     file = tmp_path / "valid_conversational_multiple_turns.jsonl"
     content = [
-        {"messages": [{"role": "user", "content": "Is it going to rain today?"}, {"role": "assistant", "content": "Yes, expect showers in the afternoon."}, {"role": "user", "content": "What is the weather like in Tokyo?"}, {"role": "assistant", "content": "It is sunny with a chance of rain."}]},
-        {"messages": [{"role": "user", "content": "Who won the game last night?"}, {"role": "assistant", "content": "The home team won by two points."}, {"role": "user", "content": "What is the weather like in Amsterdam?"}, {"role": "assistant", "content": "It is cloudy with a chance of snow."}]},
+        {
+            "messages": [
+                {"role": "user", "content": "Is it going to rain today?"},
+                {
+                    "role": "assistant",
+                    "content": "Yes, expect showers in the afternoon.",
+                },
+                {"role": "user", "content": "What is the weather like in Tokyo?"},
+                {"role": "assistant", "content": "It is sunny with a chance of rain."},
+            ]
+        },
+        {
+            "messages": [
+                {"role": "user", "content": "Who won the game last night?"},
+                {"role": "assistant", "content": "The home team won by two points."},
+                {"role": "user", "content": "What is the weather like in Amsterdam?"},
+                {"role": "assistant", "content": "It is cloudy with a chance of snow."},
+            ]
+        },
     ]
     with file.open("w") as f:
         f.write("\n".join([json.dumps(item) for item in content]))
@@ -83,7 +109,6 @@ def test_check_jsonl_empty_file(tmp_path: Path):
 
     report = check_file(file)
 
-
     assert not report["is_check_passed"]
     assert report["message"] == "File is empty"
     assert report["file_size"] == 0
@@ -127,7 +152,10 @@ def test_check_jsonl_missing_required_field(tmp_path: Path):
     report = check_file(file)
 
     assert not report["is_check_passed"]
-    assert "Error parsing file. Could not detect a format for the line 2" in report["message"]
+    assert (
+        "Error parsing file. Could not detect a format for the line 2"
+        in report["message"]
+    )
 
 
 def test_check_jsonl_inconsistent_dataset_format(tmp_path: Path):
@@ -166,7 +194,12 @@ def test_check_jsonl_non_alternating_roles(tmp_path: Path):
     # Create a JSONL file with non-alternating user/assistant roles
     file = tmp_path / "non_alternating_roles.jsonl"
     content = [
-        {"messages": [{"role": "user", "content": "Hi"}, {"role": "user", "content": "Hello again"}]}
+        {
+            "messages": [
+                {"role": "user", "content": "Hi"},
+                {"role": "user", "content": "Hello again"},
+            ]
+        }
     ]
     with file.open("w") as f:
         f.write("\n".join([json.dumps(item) for item in content]))

From 8a6b63dc84ddc47b0778b0d6a5b1280207527148 Mon Sep 17 00:00:00 2001
From: Artem Chumachenko <artek.chumak@gmail.com>
Date: Thu, 7 Nov 2024 15:37:08 +0100
Subject: [PATCH 07/21] more tests

---
 src/together/utils/files.py     |  2 +-
 tests/unit/test_files_checks.py | 30 ++++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/src/together/utils/files.py b/src/together/utils/files.py
index a3537ba8..091ff15f 100644
--- a/src/together/utils/files.py
+++ b/src/together/utils/files.py
@@ -227,7 +227,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
 
             report_dict["load_json"] = True
 
-        except ValueError as _:
+        except ValueError:
             report_dict["load_json"] = False
             if idx < 0:
                 report_dict["message"] = (
diff --git a/tests/unit/test_files_checks.py b/tests/unit/test_files_checks.py
index 6b497292..7cfdac3e 100644
--- a/tests/unit/test_files_checks.py
+++ b/tests/unit/test_files_checks.py
@@ -208,3 +208,33 @@ def test_check_jsonl_non_alternating_roles(tmp_path: Path):
 
     assert not report["is_check_passed"]
     assert "Invalid role turns" in report["message"]
+
+
+def test_check_jsonl_invalid_value_type(tmp_path: Path):
+    # Create a JSONL file with an invalid value type
+    file = tmp_path / "invalid_value_type.jsonl"
+    content = [{"text": 123}]
+    with file.open("w") as f:
+        f.write("\n".join([json.dumps(item) for item in content]))
+
+    report = check_file(file)
+    assert not report["is_check_passed"]
+    assert "Expected string" in report["message"]
+
+
+def test_check_jsonl_missing_field_in_conversation(tmp_path: Path):
+    file = tmp_path / "missing_field_in_conversation.jsonl"
+    content = [
+        {
+            "messages": [
+                {"role": "user", "content": "Hi"},
+                {"role": "assistant"},
+            ]
+        }
+    ]
+    with file.open("w") as f:
+        f.write("\n".join([json.dumps(item) for item in content]))
+
+    report = check_file(file)
+    assert not report["is_check_passed"]
+    assert "Field 'content' is missing for a turn" in report["message"]

From ad9d0a877aa195972331262254144143b5827ba0 Mon Sep 17 00:00:00 2001
From: Artem Chumachenko <artek.chumak@gmail.com>
Date: Thu, 7 Nov 2024 15:56:29 +0100
Subject: [PATCH 08/21] enhance logic

---
 src/together/resources/finetune.py |   3 +-
 src/together/utils/files.py        | 117 +++++++++++++++++------------
 2 files changed, 69 insertions(+), 51 deletions(-)

diff --git a/src/together/resources/finetune.py b/src/together/resources/finetune.py
index 40de2a1d..0148b50f 100644
--- a/src/together/resources/finetune.py
+++ b/src/together/resources/finetune.py
@@ -157,7 +157,8 @@ def create(
                 Defaults to False.
             model_limits (FinetuneTrainingLimits, optional): Limits for the hyperparameters the model in Fine-tuning.
                 Defaults to None.
-            train_on_inputs (bool, optional): Whether to mask the inputs in conversational data. Defaults to "auto".
+            train_on_inputs (bool, optional): Whether to mask the user messages in conversational data or prompts in instruction data.
+                Defaults to "auto".
 
         Returns:
             FinetuneResponse: Object containing information about fine-tuning job.
diff --git a/src/together/utils/files.py b/src/together/utils/files.py
index 091ff15f..6a45511c 100644
--- a/src/together/utils/files.py
+++ b/src/together/utils/files.py
@@ -23,8 +23,16 @@
 class InvalidFileFormatError(Exception):
     """Exception raised for invalid file formats during file checks."""
 
-    def __init__(self) -> None:
-        super().__init__()
+    def __init__(
+        self,
+        message: str = "",
+        line_number: int | None = None,
+        field: str | None = None,
+    ) -> None:
+        super().__init__(message)
+        self.message = message
+        self.line_number = line_number
+        self.field = field
 
 
 def check_file(
@@ -101,25 +109,23 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
 
     dataset_format = None
     with file.open() as f:
-        # idx must be instantiated so decode errors (e.g. file is a tar) or empty files are caught
         idx = -1
         try:
             for idx, line in enumerate(f):
-                json_line = json.loads(line)  # each line in jsonlines should be a json
+                json_line = json.loads(line)
 
                 if not isinstance(json_line, dict):
-                    report_dict["line_type"] = False
-                    report_dict["message"] = (
-                        f"Error parsing file. Invalid format on line {idx + 1} of the input file. "
-                        'Example of valid json: {"text": "my sample string"}. '
+                    raise InvalidFileFormatError(
+                        message=(
+                            f"Error parsing file. Invalid format on line {idx + 1} of the input file. "
+                            'Example of valid json: {"text": "my sample string"}. '
+                        ),
+                        line_number=idx + 1,
+                        field="line_type",
                     )
 
-                    report_dict["is_check_passed"] = False
-                    raise InvalidFileFormatError
-
                 current_format = None
                 for possible_format in JSONL_REQUIRED_COLUMNS_MAP:
-                    # Check if every column in the data contains in
                     if all(
                         column in json_line
                         for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format]
@@ -127,71 +133,72 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                         if current_format is None:
                             current_format = possible_format
                         elif current_format != possible_format:
-                            report_dict["message"] = (
-                                "Found multiple dataset formats in the input file. "
-                                f"Got {current_format} and {possible_format} on line {idx + 1}."
+                            raise InvalidFileFormatError(
+                                message="Found multiple dataset formats in the input file. "
+                                f"Got {current_format} and {possible_format} on line {idx + 1}.",
+                                line_number=idx + 1,
                             )
-                            raise InvalidFileFormatError
 
                 if current_format is None:
-                    report_dict["message"] = (
-                        f"Error parsing file. Could not detect a format for the line {idx + 1} with the columns:\n"
-                        f"{json_line.keys()}"
+                    raise InvalidFileFormatError(
+                        message=(
+                            f"Error parsing file. Could not detect a format for the line {idx + 1} with the columns:\n"
+                            f"{json_line.keys()}"
+                        ),
+                        line_number=idx + 1,
                     )
-                    raise InvalidFileFormatError
 
                 if current_format == DatasetFormat.CONVERSATION:
                     message_column = JSONL_REQUIRED_COLUMNS_MAP[
                         DatasetFormat.CONVERSATION
                     ][0]
                     if not isinstance(json_line[message_column], list):
-                        report_dict["key_value"] = False
-                        report_dict["message"] = (
-                            f"Invalid format on line {idx + 1} of the input file. "
-                            f"Expected a list of messages. Found {type(json_line[message_column])}"
+                        raise InvalidFileFormatError(
+                            message=f"Invalid format on line {idx + 1} of the input file. "
+                            f"Expected a list of messages. Found {type(json_line[message_column])}",
+                            line_number=idx + 1,
+                            field="key_value",
                         )
-                        raise InvalidFileFormatError
 
                     previous_role = ""
                     for turn in json_line[message_column]:
                         for column in REQUIRED_COLUMNS_MESSAGE:
                             if column not in turn:
-                                report_dict["key_value"] = False
-                                report_dict["message"] = (
-                                    f"Field '{column}' is missing for a turn `{turn}` on line {idx + 1} "
-                                    "of the the input file."
+                                raise InvalidFileFormatError(
+                                    message=f"Field '{column}' is missing for a turn `{turn}` on line {idx + 1} "
+                                    "of the the input file.",
+                                    line_number=idx + 1,
+                                    field="key_value",
                                 )
-                                raise InvalidFileFormatError
                             else:
                                 if not isinstance(turn[column], str):
-                                    report_dict["text_field"] = False
-                                    report_dict["message"] = (
-                                        f"Invalid format on line {idx + 1} in the column {column} for turn `{turn}` "
-                                        f"of the input file. Expected string. Found {type(turn[column])}"
+                                    raise InvalidFileFormatError(
+                                        message=f"Invalid format on line {idx + 1} in the column {column} for turn `{turn}` "
+                                        f"of the input file. Expected string. Found {type(turn[column])}",
+                                        line_number=idx + 1,
+                                        field="text_field",
                                     )
-                                    raise InvalidFileFormatError
                         role = turn["role"]
 
                         if role not in POSSIBLE_ROLES_CONVERSATION:
-                            report_dict["key_value"] = False
-                            report_dict["message"] = (
-                                f"Found invalid role '{role}' in the messages on the line {idx + 1}. "
-                                f"Possible roles in the conversation are: {POSSIBLE_ROLES_CONVERSATION}"
+                            raise InvalidFileFormatError(
+                                message=f"Found invalid role '{role}' in the messages on the line {idx + 1}. "
+                                f"Possible roles in the conversation are: {POSSIBLE_ROLES_CONVERSATION}",
+                                line_number=idx + 1,
+                                field="key_value",
                             )
-                            raise InvalidFileFormatError
 
                         if previous_role == role:
-                            report_dict["key_value"] = False
-                            report_dict["message"] = (
-                                f"Invalid role turns on line {idx + 1} of the input file. "
-                                "'user' and 'assistant' roles must alternate user/assistant/user/assistant/..."
+                            raise InvalidFileFormatError(
+                                message=f"Invalid role turns on line {idx + 1} of the input file. "
+                                "'user' and 'assistant' roles must alternate user/assistant/user/assistant/...",
+                                line_number=idx + 1,
+                                field="key_value",
                             )
-                            raise InvalidFileFormatError
 
                         previous_role = role
 
                 else:
-                    # check to make sure the value of the keys is a string
                     for column in JSONL_REQUIRED_COLUMNS_MAP[current_format]:
                         if not isinstance(json_line[column], str):
                             report_dict["key_value"] = False
@@ -199,7 +206,11 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                                 f'Invalid value type for "{column}" key on line {idx + 1}. '
                                 f"Expected string. Found {type(json_line[column])}."
                             )
-                            raise InvalidFileFormatError
+                            raise InvalidFileFormatError(
+                                message=report_dict["message"],
+                                line_number=idx + 1,
+                                field="key_value",
+                            )
 
                 if dataset_format is None:
                     dataset_format = current_format
@@ -210,9 +221,10 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                             f"Got {dataset_format} for the first line and {current_format} "
                             f"for the line {idx + 1}."
                         )
-                        raise InvalidFileFormatError
+                        raise InvalidFileFormatError(
+                            message=report_dict["message"], line_number=idx + 1
+                        )
 
-            # make sure this is outside the for idx, line in enumerate(f): for loop
             if idx + 1 < MIN_SAMPLES:
                 report_dict["min_samples"] = False
                 report_dict["message"] = (
@@ -239,9 +251,14 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                     f"Error parsing json payload. Unexpected format on line {idx + 1}."
                 )
             report_dict["is_check_passed"] = False
-        except InvalidFileFormatError:
+        except InvalidFileFormatError as e:
             report_dict["load_json"] = False
             report_dict["is_check_passed"] = False
+            report_dict["message"] = e.message
+            if e.line_number is not None:
+                report_dict["line_number"] = e.line_number
+            if e.field is not None:
+                report_dict[e.field] = False
 
     if "text_field" not in report_dict:
         report_dict["text_field"] = True

From 487fbae7bd1f7c6df03f69608bb03f97a654c847 Mon Sep 17 00:00:00 2001
From: Artem Chumachenko <artek.chumak@gmail.com>
Date: Thu, 7 Nov 2024 15:58:51 +0100
Subject: [PATCH 09/21] enhance logic

---
 src/together/utils/files.py | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/src/together/utils/files.py b/src/together/utils/files.py
index 6a45511c..cc8c7b7d 100644
--- a/src/together/utils/files.py
+++ b/src/together/utils/files.py
@@ -201,13 +201,9 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                 else:
                     for column in JSONL_REQUIRED_COLUMNS_MAP[current_format]:
                         if not isinstance(json_line[column], str):
-                            report_dict["key_value"] = False
-                            report_dict["message"] = (
-                                f'Invalid value type for "{column}" key on line {idx + 1}. '
-                                f"Expected string. Found {type(json_line[column])}."
-                            )
                             raise InvalidFileFormatError(
-                                message=report_dict["message"],
+                                message=f'Invalid value type for "{column}" key on line {idx + 1}. '
+                                f"Expected string. Found {type(json_line[column])}.",
                                 line_number=idx + 1,
                                 field="key_value",
                             )
@@ -216,13 +212,11 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                     dataset_format = current_format
                 elif current_format is not None:
                     if current_format != dataset_format:
-                        report_dict["message"] = (
-                            "All samples in the dataset must have the same dataset format. "
-                            f"Got {dataset_format} for the first line and {current_format} "
-                            f"for the line {idx + 1}."
-                        )
                         raise InvalidFileFormatError(
-                            message=report_dict["message"], line_number=idx + 1
+                            message="All samples in the dataset must have the same dataset format. "
+                            f"Got {dataset_format} for the first line and {current_format} "
+                            f"for the line {idx + 1}.",
+                            line_number=idx + 1,
                         )
 
             if idx + 1 < MIN_SAMPLES:

From a400517ee6a12b9fc4a43907f09a3efde7f8e031 Mon Sep 17 00:00:00 2001
From: Artem Chumachenko <artek.chumak@gmail.com>
Date: Fri, 8 Nov 2024 16:26:06 +0100
Subject: [PATCH 10/21] pr feedback part 1

---
 src/together/resources/finetune.py | 10 ++++---
 src/together/utils/files.py        | 42 ++++++++++++++++--------------
 tests/unit/test_files_checks.py    | 32 +++++++++++------------
 3 files changed, 46 insertions(+), 38 deletions(-)

diff --git a/src/together/resources/finetune.py b/src/together/resources/finetune.py
index 0148b50f..3c17c6ad 100644
--- a/src/together/resources/finetune.py
+++ b/src/together/resources/finetune.py
@@ -140,7 +140,7 @@ def create(
             n_evals (int, optional): Number of evaluation loops to run. Defaults to 0.
             n_checkpoints (int, optional): Number of checkpoints to save during fine-tuning.
                 Defaults to 1.
-            batch_size (int, optional): Batch size for fine-tuning. Defaults to max.
+            batch_size (int or "max"): Batch size for fine-tuning. Defaults to max.
             learning_rate (float, optional): Learning rate multiplier to use for training
                 Defaults to 0.00001.
             warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
@@ -157,7 +157,11 @@ def create(
                 Defaults to False.
             model_limits (FinetuneTrainingLimits, optional): Limits for the hyperparameters the model in Fine-tuning.
                 Defaults to None.
-            train_on_inputs (bool, optional): Whether to mask the user messages in conversational data or prompts in instruction data.
+            train_on_inputs (bool or "auto"): Whether to mask the user messages in conversational data or prompts in instruction data.
+                "auto" will automatically determine whether to mask the inputs based on the data format.
+                Dataset with "text" (General format) field will not mask the inputs by default.
+                Dataset with "messages" (Conversational format) or "prompt" and "completion" (Instruction format)
+                fields will mask the inputs by default.
                 Defaults to "auto".
 
         Returns:
@@ -472,7 +476,7 @@ async def create(
                 Defaults to False.
             model_limits (FinetuneTrainingLimits, optional): Limits for the hyperparameters the model in Fine-tuning.
                 Defaults to None.
-            train_on_inputs (bool, optional): Whether to mask the inputs in conversational data. Defaults to "auto".
+            train_on_inputs (bool or "auto"): Whether to mask the user messages in conversational data or prompts in instruction data.
 
         Returns:
             FinetuneResponse: Object containing information about fine-tuning job.
diff --git a/src/together/utils/files.py b/src/together/utils/files.py
index cc8c7b7d..c382d3bb 100644
--- a/src/together/utils/files.py
+++ b/src/together/utils/files.py
@@ -20,19 +20,19 @@
 )
 
 
-class InvalidFileFormatError(Exception):
+class InvalidFileFormatError(ValueError):
     """Exception raised for invalid file formats during file checks."""
 
     def __init__(
         self,
         message: str = "",
         line_number: int | None = None,
-        field: str | None = None,
+        error_source: str | None = None,
     ) -> None:
         super().__init__(message)
         self.message = message
         self.line_number = line_number
-        self.field = field
+        self.error_source = error_source
 
 
 def check_file(
@@ -50,7 +50,7 @@ def check_file(
         "line_type": None,
         "text_field": None,
         "key_value": None,
-        "min_samples": None,
+        "has_min_samples": None,
         "num_samples": None,
         "load_json": None,
     }
@@ -121,7 +121,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                             'Example of valid json: {"text": "my sample string"}. '
                         ),
                         line_number=idx + 1,
-                        field="line_type",
+                        error_source="line_type",
                     )
 
                 current_format = None
@@ -137,6 +137,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                                 message="Found multiple dataset formats in the input file. "
                                 f"Got {current_format} and {possible_format} on line {idx + 1}.",
                                 line_number=idx + 1,
+                                error_source="format",
                             )
 
                 if current_format is None:
@@ -146,6 +147,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                             f"{json_line.keys()}"
                         ),
                         line_number=idx + 1,
+                        error_source="format",
                     )
 
                 if current_format == DatasetFormat.CONVERSATION:
@@ -157,7 +159,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                             message=f"Invalid format on line {idx + 1} of the input file. "
                             f"Expected a list of messages. Found {type(json_line[message_column])}",
                             line_number=idx + 1,
-                            field="key_value",
+                            error_source="key_value",
                         )
 
                     previous_role = ""
@@ -165,10 +167,10 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                         for column in REQUIRED_COLUMNS_MESSAGE:
                             if column not in turn:
                                 raise InvalidFileFormatError(
-                                    message=f"Field '{column}' is missing for a turn `{turn}` on line {idx + 1} "
+                                    message=f"Field `{column}` is missing for a turn `{turn}` on line {idx + 1} "
                                     "of the the input file.",
                                     line_number=idx + 1,
-                                    field="key_value",
+                                    error_source="key_value",
                                 )
                             else:
                                 if not isinstance(turn[column], str):
@@ -176,24 +178,24 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                                         message=f"Invalid format on line {idx + 1} in the column {column} for turn `{turn}` "
                                         f"of the input file. Expected string. Found {type(turn[column])}",
                                         line_number=idx + 1,
-                                        field="text_field",
+                                        error_source="text_field",
                                     )
                         role = turn["role"]
 
                         if role not in POSSIBLE_ROLES_CONVERSATION:
                             raise InvalidFileFormatError(
-                                message=f"Found invalid role '{role}' in the messages on the line {idx + 1}. "
+                                message=f"Found invalid role `{role}` in the messages on the line {idx + 1}. "
                                 f"Possible roles in the conversation are: {POSSIBLE_ROLES_CONVERSATION}",
                                 line_number=idx + 1,
-                                field="key_value",
+                                error_source="key_value",
                             )
 
                         if previous_role == role:
                             raise InvalidFileFormatError(
                                 message=f"Invalid role turns on line {idx + 1} of the input file. "
-                                "'user' and 'assistant' roles must alternate user/assistant/user/assistant/...",
+                                "`user` and `assistant` roles must alternate user/assistant/user/assistant/...",
                                 line_number=idx + 1,
-                                field="key_value",
+                                error_source="key_value",
                             )
 
                         previous_role = role
@@ -205,7 +207,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                                 message=f'Invalid value type for "{column}" key on line {idx + 1}. '
                                 f"Expected string. Found {type(json_line[column])}.",
                                 line_number=idx + 1,
-                                field="key_value",
+                                error_source="key_value",
                             )
 
                 if dataset_format is None:
@@ -217,10 +219,11 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                             f"Got {dataset_format} for the first line and {current_format} "
                             f"for the line {idx + 1}.",
                             line_number=idx + 1,
+                            error_source="format",
                         )
 
             if idx + 1 < MIN_SAMPLES:
-                report_dict["min_samples"] = False
+                report_dict["has_min_samples"] = False
                 report_dict["message"] = (
                     f"Processing {file} resulted in only {idx + 1} samples. "
                     f"Our minimum is {MIN_SAMPLES} samples. "
@@ -228,7 +231,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                 report_dict["is_check_passed"] = False
             else:
                 report_dict["num_samples"] = idx + 1
-                report_dict["min_samples"] = True
+                report_dict["has_min_samples"] = True
                 report_dict["is_check_passed"] = True
 
             report_dict["load_json"] = True
@@ -251,8 +254,8 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
             report_dict["message"] = e.message
             if e.line_number is not None:
                 report_dict["line_number"] = e.line_number
-            if e.field is not None:
-                report_dict[e.field] = False
+            if e.error_source is not None:
+                report_dict[e.error_source] = False
 
     if "text_field" not in report_dict:
         report_dict["text_field"] = True
@@ -295,7 +298,8 @@ def _check_parquet(file: Path) -> Dict[str, Any]:
 
     num_samples = len(table)
     if num_samples < MIN_SAMPLES:
-        report_dict["min_samples"] = (
+        report_dict["has_min_samples"] = False
+        report_dict["message"] = (
             f"Processing {file} resulted in only {num_samples} samples. "
             f"Our minimum is {MIN_SAMPLES} samples. "
         )
diff --git a/tests/unit/test_files_checks.py b/tests/unit/test_files_checks.py
index 7cfdac3e..0412be64 100644
--- a/tests/unit/test_files_checks.py
+++ b/tests/unit/test_files_checks.py
@@ -11,14 +11,14 @@ def test_check_jsonl_valid_general(tmp_path: Path):
     file = tmp_path / "valid.jsonl"
     content = [{"text": "Hello, world!"}, {"text": "How are you?"}]
     with file.open("w") as f:
-        f.write("\n".join([json.dumps(item) for item in content]))
+        f.write("\n".join(json.dumps(item) for item in content))
 
     report = check_file(file)
 
     assert report["is_check_passed"]
     assert report["utf8"]
     assert report["num_samples"] == len(content)
-    assert report["min_samples"] >= MIN_SAMPLES
+    assert report["has_min_samples"]
 
 
 def test_check_jsonl_valid_instruction(tmp_path: Path):
@@ -26,17 +26,17 @@ def test_check_jsonl_valid_instruction(tmp_path: Path):
     file = tmp_path / "valid_instruction.jsonl"
     content = [
         {"prompt": "Translate the following sentence.", "completion": "Hello, world!"},
-        {"prompt": "Summarize the text.", "completion": "OpenAI creates advanced AI."},
+        {"prompt": "Summarize the text.", "completion": "Weyland-Yutani Corporation creates advanced AI."},
     ]
     with file.open("w") as f:
-        f.write("\n".join([json.dumps(item) for item in content]))
+        f.write("\n".join(json.dumps(item) for item in content))
 
     report = check_file(file)
 
     assert report["is_check_passed"]
     assert report["utf8"]
     assert report["num_samples"] == len(content)
-    assert report["min_samples"] >= MIN_SAMPLES
+    assert report["has_min_samples"]
 
 
 def test_check_jsonl_valid_conversational_single_turn(tmp_path: Path):
@@ -57,14 +57,14 @@ def test_check_jsonl_valid_conversational_single_turn(tmp_path: Path):
         },
     ]
     with file.open("w") as f:
-        f.write("\n".join([json.dumps(item) for item in content]))
+        f.write("\n".join(json.dumps(item) for item in content))
 
     report = check_file(file)
 
     assert report["is_check_passed"]
     assert report["utf8"]
     assert report["num_samples"] == len(content)
-    assert report["min_samples"] >= MIN_SAMPLES
+    assert report["has_min_samples"]
 
 
 def test_check_jsonl_valid_conversational_multiple_turns(tmp_path: Path):
@@ -92,14 +92,14 @@ def test_check_jsonl_valid_conversational_multiple_turns(tmp_path: Path):
         },
     ]
     with file.open("w") as f:
-        f.write("\n".join([json.dumps(item) for item in content]))
+        f.write("\n".join(json.dumps(item) for item in content))
 
     report = check_file(file)
 
     assert report["is_check_passed"]
     assert report["utf8"]
     assert report["num_samples"] == len(content)
-    assert report["min_samples"] >= MIN_SAMPLES
+    assert report["has_min_samples"]
 
 
 def test_check_jsonl_empty_file(tmp_path: Path):
@@ -131,7 +131,7 @@ def test_check_jsonl_invalid_json(tmp_path: Path):
     file = tmp_path / "invalid_json.jsonl"
     content = [{"text": "Hello, world!"}, "Invalid JSON Line"]
     with file.open("w") as f:
-        f.write("\n".join([json.dumps(item) for item in content]))
+        f.write("\n".join(json.dumps(item) for item in content))
 
     report = check_file(file)
 
@@ -147,7 +147,7 @@ def test_check_jsonl_missing_required_field(tmp_path: Path):
         {"prompt": "Summarize the text."},
     ]
     with file.open("w") as f:
-        f.write("\n".join([json.dumps(item) for item in content]))
+        f.write("\n".join(json.dumps(item) for item in content))
 
     report = check_file(file)
 
@@ -166,7 +166,7 @@ def test_check_jsonl_inconsistent_dataset_format(tmp_path: Path):
         {"text": "How are you?"},  # Missing 'messages'
     ]
     with file.open("w") as f:
-        f.write("\n".join([json.dumps(item) for item in content]))
+        f.write("\n".join(json.dumps(item) for item in content))
 
     report = check_file(file)
 
@@ -182,7 +182,7 @@ def test_check_jsonl_invalid_role(tmp_path: Path):
     file = tmp_path / "invalid_role.jsonl"
     content = [{"messages": [{"role": "invalid_role", "content": "Hi"}]}]
     with file.open("w") as f:
-        f.write("\n".join([json.dumps(item) for item in content]))
+        f.write("\n".join(json.dumps(item) for item in content))
 
     report = check_file(file)
 
@@ -202,7 +202,7 @@ def test_check_jsonl_non_alternating_roles(tmp_path: Path):
         }
     ]
     with file.open("w") as f:
-        f.write("\n".join([json.dumps(item) for item in content]))
+        f.write("\n".join(json.dumps(item) for item in content))
 
     report = check_file(file)
 
@@ -215,7 +215,7 @@ def test_check_jsonl_invalid_value_type(tmp_path: Path):
     file = tmp_path / "invalid_value_type.jsonl"
     content = [{"text": 123}]
     with file.open("w") as f:
-        f.write("\n".join([json.dumps(item) for item in content]))
+        f.write("\n".join(json.dumps(item) for item in content))
 
     report = check_file(file)
     assert not report["is_check_passed"]
@@ -233,7 +233,7 @@ def test_check_jsonl_missing_field_in_conversation(tmp_path: Path):
         }
     ]
     with file.open("w") as f:
-        f.write("\n".join([json.dumps(item) for item in content]))
+        f.write("\n".join(json.dumps(item) for item in content))
 
     report = check_file(file)
     assert not report["is_check_passed"]

From c933f1643e56d9099f442975e47d764959033a18 Mon Sep 17 00:00:00 2001
From: Artem Chumachenko <artek.chumak@gmail.com>
Date: Fri, 8 Nov 2024 16:30:59 +0100
Subject: [PATCH 11/21] style and fixes

---
 src/together/utils/files.py     | 16 ++++++++--------
 tests/unit/test_files_checks.py |  9 ++++++---
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/src/together/utils/files.py b/src/together/utils/files.py
index c382d3bb..95f2fab1 100644
--- a/src/together/utils/files.py
+++ b/src/together/utils/files.py
@@ -236,6 +236,14 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
 
             report_dict["load_json"] = True
 
+        except InvalidFileFormatError as e:
+            report_dict["load_json"] = False
+            report_dict["is_check_passed"] = False
+            report_dict["message"] = e.message
+            if e.line_number is not None:
+                report_dict["line_number"] = e.line_number
+            if e.error_source is not None:
+                report_dict[e.error_source] = False
         except ValueError:
             report_dict["load_json"] = False
             if idx < 0:
@@ -248,14 +256,6 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                     f"Error parsing json payload. Unexpected format on line {idx + 1}."
                 )
             report_dict["is_check_passed"] = False
-        except InvalidFileFormatError as e:
-            report_dict["load_json"] = False
-            report_dict["is_check_passed"] = False
-            report_dict["message"] = e.message
-            if e.line_number is not None:
-                report_dict["line_number"] = e.line_number
-            if e.error_source is not None:
-                report_dict[e.error_source] = False
 
     if "text_field" not in report_dict:
         report_dict["text_field"] = True
diff --git a/tests/unit/test_files_checks.py b/tests/unit/test_files_checks.py
index 0412be64..dcabb6e3 100644
--- a/tests/unit/test_files_checks.py
+++ b/tests/unit/test_files_checks.py
@@ -26,7 +26,10 @@ def test_check_jsonl_valid_instruction(tmp_path: Path):
     file = tmp_path / "valid_instruction.jsonl"
     content = [
         {"prompt": "Translate the following sentence.", "completion": "Hello, world!"},
-        {"prompt": "Summarize the text.", "completion": "Weyland-Yutani Corporation creates advanced AI."},
+        {
+            "prompt": "Summarize the text.",
+            "completion": "Weyland-Yutani Corporation creates advanced AI.",
+        },
     ]
     with file.open("w") as f:
         f.write("\n".join(json.dumps(item) for item in content))
@@ -187,7 +190,7 @@ def test_check_jsonl_invalid_role(tmp_path: Path):
     report = check_file(file)
 
     assert not report["is_check_passed"]
-    assert "Found invalid role 'invalid_role'" in report["message"]
+    assert "Found invalid role `invalid_role`" in report["message"]
 
 
 def test_check_jsonl_non_alternating_roles(tmp_path: Path):
@@ -237,4 +240,4 @@ def test_check_jsonl_missing_field_in_conversation(tmp_path: Path):
 
     report = check_file(file)
     assert not report["is_check_passed"]
-    assert "Field 'content' is missing for a turn" in report["message"]
+    assert "Field `content` is missing for a turn" in report["message"]

From 642333044e9eacd6e047c802530d27e7f55cc7bf Mon Sep 17 00:00:00 2001
From: Artem Chumachenko <artek.chumak@gmail.com>
Date: Tue, 12 Nov 2024 17:10:49 +0100
Subject: [PATCH 12/21] pr feedback

---
 src/together/cli/api/finetune.py   |  6 ++++--
 src/together/resources/finetune.py | 11 ++++++++---
 src/together/utils/files.py        |  2 +-
 3 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/together/cli/api/finetune.py b/src/together/cli/api/finetune.py
index 4d4896b2..1983f2b6 100644
--- a/src/together/cli/api/finetune.py
+++ b/src/together/cli/api/finetune.py
@@ -12,7 +12,7 @@
 
 from together import Together
 from together.cli.api.utils import BOOL_WITH_AUTO, INT_WITH_MAX
-from together.utils import finetune_price_to_dollars, log_warn, parse_timestamp
+from together.utils import finetune_price_to_dollars, log_warn, log_warn_once, parse_timestamp
 from together.types.finetune import DownloadCheckpointType, FinetuneTrainingLimits
 
 
@@ -97,7 +97,8 @@ def fine_tuning(ctx: click.Context) -> None:
     "--train-on-inputs",
     type=BOOL_WITH_AUTO,
     default="auto",
-    help="Whether to mask the user messages in conversational data or prompts in instruction data",
+    help="Whether to mask the user messages in conversational data or prompts in instruction data. "
+    "`auto` will automatically determine whether to mask the inputs based on the data format.",
 )
 def create(
     ctx: click.Context,
@@ -148,6 +149,7 @@ def create(
     )
 
     if lora:
+        log_warn_once("LoRA rank default has been changed from 8 to 64 as the maximum available for each model.")
         if model_limits.lora_training is None:
             raise click.BadParameter(
                 f"LoRA fine-tuning is not supported for the model `{model}`"
diff --git a/src/together/resources/finetune.py b/src/together/resources/finetune.py
index 3c17c6ad..79596dd2 100644
--- a/src/together/resources/finetune.py
+++ b/src/together/resources/finetune.py
@@ -159,9 +159,9 @@ def create(
                 Defaults to None.
             train_on_inputs (bool or "auto"): Whether to mask the user messages in conversational data or prompts in instruction data.
                 "auto" will automatically determine whether to mask the inputs based on the data format.
-                Dataset with "text" (General format) field will not mask the inputs by default.
-                Dataset with "messages" (Conversational format) or "prompt" and "completion" (Instruction format)
-                fields will mask the inputs by default.
+                For datasets with the "text" field (general format), inputs will not be masked.
+                For datasets with the "messages" field (conversational format) or "prompt" and "completion" fields
+                (instruction format), inputs will be masked.
                 Defaults to "auto".
 
         Returns:
@@ -477,6 +477,11 @@ async def create(
             model_limits (FinetuneTrainingLimits, optional): Limits for the hyperparameters the model in Fine-tuning.
                 Defaults to None.
             train_on_inputs (bool or "auto"): Whether to mask the user messages in conversational data or prompts in instruction data.
+                "auto" will automatically determine whether to mask the inputs based on the data format.
+                For datasets with the "text" field (general format), inputs will not be masked.
+                For datasets with the "messages" field (conversational format) or "prompt" and "completion" fields
+                (instruction format), inputs will be masked.
+                Defaults to "auto".
 
         Returns:
             FinetuneResponse: Object containing information about fine-tuning job.
diff --git a/src/together/utils/files.py b/src/together/utils/files.py
index 95f2fab1..360a0bce 100644
--- a/src/together/utils/files.py
+++ b/src/together/utils/files.py
@@ -162,7 +162,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                             error_source="key_value",
                         )
 
-                    previous_role = ""
+                    previous_role = None
                     for turn in json_line[message_column]:
                         for column in REQUIRED_COLUMNS_MESSAGE:
                             if column not in turn:

From 85c51c4f90efa35eecd33a848c5b61b575348fcf Mon Sep 17 00:00:00 2001
From: Artem Chumachenko <artek.chumak@gmail.com>
Date: Tue, 12 Nov 2024 17:13:33 +0100
Subject: [PATCH 13/21] style

---
 src/together/cli/api/finetune.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/together/cli/api/finetune.py b/src/together/cli/api/finetune.py
index 1983f2b6..74f6a483 100644
--- a/src/together/cli/api/finetune.py
+++ b/src/together/cli/api/finetune.py
@@ -12,7 +12,12 @@
 
 from together import Together
 from together.cli.api.utils import BOOL_WITH_AUTO, INT_WITH_MAX
-from together.utils import finetune_price_to_dollars, log_warn, log_warn_once, parse_timestamp
+from together.utils import (
+    finetune_price_to_dollars,
+    log_warn,
+    log_warn_once,
+    parse_timestamp,
+)
 from together.types.finetune import DownloadCheckpointType, FinetuneTrainingLimits
 
 
@@ -149,7 +154,9 @@ def create(
     )
 
     if lora:
-        log_warn_once("LoRA rank default has been changed from 8 to 64 as the maximum available for each model.")
+        log_warn_once(
+            "LoRA rank default has been changed from 8 to 64 as the maximum available for each model."
+        )
         if model_limits.lora_training is None:
             raise click.BadParameter(
                 f"LoRA fine-tuning is not supported for the model `{model}`"

From 6268151a65d564e833ebb66a04304d91d88d9df2 Mon Sep 17 00:00:00 2001
From: Artem Chumachenko <artek.chumak@gmail.com>
Date: Wed, 13 Nov 2024 11:19:34 +0100
Subject: [PATCH 14/21] style

---
 src/together/utils/files.py     |  9 +++++++++
 tests/unit/test_files_checks.py | 22 ++++++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/src/together/utils/files.py b/src/together/utils/files.py
index 360a0bce..570aa508 100644
--- a/src/together/utils/files.py
+++ b/src/together/utils/files.py
@@ -162,6 +162,15 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                             error_source="key_value",
                         )
 
+                    for turn_id, turn in enumerate(json_line[message_column]):
+                        if not isinstance(turn, dict):
+                            raise InvalidFileFormatError(
+                                message=f"Invalid format on line {idx + 1} of the input file. "
+                                f"Expected a dictionary in the {turn_id + 1} turn. Found {type(turn)}",
+                                line_number=idx + 1,
+                                error_source="key_value",
+                            )
+
                     previous_role = None
                     for turn in json_line[message_column]:
                         for column in REQUIRED_COLUMNS_MESSAGE:
diff --git a/tests/unit/test_files_checks.py b/tests/unit/test_files_checks.py
index dcabb6e3..e79f1986 100644
--- a/tests/unit/test_files_checks.py
+++ b/tests/unit/test_files_checks.py
@@ -241,3 +241,25 @@ def test_check_jsonl_missing_field_in_conversation(tmp_path: Path):
     report = check_file(file)
     assert not report["is_check_passed"]
     assert "Field `content` is missing for a turn" in report["message"]
+
+
+def test_check_jsonl_wrong_turn_type(tmp_path: Path):
+    file = tmp_path / "wrong_turn_type.jsonl"
+    content = [
+        {
+            "messages": [
+                "Hi!",
+                {"role": "user", "content": "Hi"},
+                {"role": "assistant"},
+            ]
+        }
+    ]
+    with file.open("w") as f:
+        f.write("\n".join(json.dumps(item) for item in content))
+
+    report = check_file(file)
+    assert not report["is_check_passed"]
+    assert (
+        "Invalid format on line 1 of the input file. Expected a dictionary"
+        in report["message"]
+    )

From 6f3f8e3e39c2c504cbcb015be935fde986dc6ffc Mon Sep 17 00:00:00 2001
From: Artem Chumachenko <artek.chumak@gmail.com>
Date: Wed, 13 Nov 2024 11:33:24 +0100
Subject: [PATCH 15/21] fix typing

---
 src/together/resources/finetune.py | 4 +++-
 src/together/types/finetune.py     | 4 ++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/together/resources/finetune.py b/src/together/resources/finetune.py
index 79596dd2..841b5913 100644
--- a/src/together/resources/finetune.py
+++ b/src/together/resources/finetune.py
@@ -83,6 +83,8 @@ def createFinetuneRequest(
     if warmup_ratio > 1 or warmup_ratio < 0:
         raise ValueError("Warmup ratio should be between 0 and 1")
 
+    train_on_inputs_bool = train_on_inputs if train_on_inputs != "auto" else None
+
     finetune_request = FinetuneRequest(
         model=model,
         training_file=training_file,
@@ -96,7 +98,7 @@ def createFinetuneRequest(
         training_type=training_type,
         suffix=suffix,
         wandb_key=wandb_api_key,
-        train_on_inputs=train_on_inputs,
+        train_on_inputs=train_on_inputs_bool,
     )
 
     return finetune_request
diff --git a/src/together/types/finetune.py b/src/together/types/finetune.py
index 917a5143..e7733458 100644
--- a/src/together/types/finetune.py
+++ b/src/together/types/finetune.py
@@ -163,7 +163,7 @@ class FinetuneRequest(BaseModel):
     # weights & biases api key
     wandb_key: str | None = None
     training_type: FullTrainingType | LoRATrainingType | None = None
-    train_on_inputs: bool | Literal["auto"] = "auto"
+    train_on_inputs: bool | None = None
 
 
 class FinetuneResponse(BaseModel):
@@ -231,7 +231,7 @@ class FinetuneResponse(BaseModel):
     # training file metadata
     training_file_num_lines: int | None = Field(None, alias="TrainingFileNumLines")
     training_file_size: int | None = Field(None, alias="TrainingFileSize")
-    train_on_inputs: bool | Literal["auto"] = "auto"
+    train_on_inputs: bool | None = None
 
     @field_validator("training_type")
     @classmethod

From 567abdad071f7b231e2f8d8763baa2ea668b7c97 Mon Sep 17 00:00:00 2001
From: Artem Chumachenko <artek.chumak@gmail.com>
Date: Wed, 13 Nov 2024 16:46:18 +0100
Subject: [PATCH 16/21] change to strict boolean

---
 src/together/types/finetune.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/together/types/finetune.py b/src/together/types/finetune.py
index e7733458..7ca28486 100644
--- a/src/together/types/finetune.py
+++ b/src/together/types/finetune.py
@@ -3,7 +3,7 @@
 from enum import Enum
 from typing import List, Literal
 
-from pydantic import Field, validator, field_validator
+from pydantic import StrictBool, Field, validator, field_validator
 
 from together.types.abstract import BaseModel
 from together.types.common import (
@@ -163,7 +163,7 @@ class FinetuneRequest(BaseModel):
     # weights & biases api key
     wandb_key: str | None = None
     training_type: FullTrainingType | LoRATrainingType | None = None
-    train_on_inputs: bool | None = None
+    train_on_inputs: StrictBool | None = None
 
 
 class FinetuneResponse(BaseModel):

From 268ca77ea13d92610bd10896a8b42324c7f59b27 Mon Sep 17 00:00:00 2001
From: Artem Chumachenko <artek.chumak@gmail.com>
Date: Wed, 13 Nov 2024 16:47:28 +0100
Subject: [PATCH 17/21] error out on train_on_inputs

---
 src/together/resources/finetune.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/together/resources/finetune.py b/src/together/resources/finetune.py
index 841b5913..b686f6c9 100644
--- a/src/together/resources/finetune.py
+++ b/src/together/resources/finetune.py
@@ -83,6 +83,9 @@ def createFinetuneRequest(
     if warmup_ratio > 1 or warmup_ratio < 0:
         raise ValueError("Warmup ratio should be between 0 and 1")
 
+    if train_on_inputs is None:
+        raise ValueError("train_on_inputs cannot be None")
+
     train_on_inputs_bool = train_on_inputs if train_on_inputs != "auto" else None
 
     finetune_request = FinetuneRequest(

From f8c6166dc5a8693608ae5fc577b2dfe489c699d4 Mon Sep 17 00:00:00 2001
From: Artem Chumachenko <artek.chumak@gmail.com>
Date: Thu, 14 Nov 2024 17:37:49 +0100
Subject: [PATCH 18/21] use "auto" directly

---
 src/together/cli/api/finetune.py   | 7 ++++---
 src/together/resources/finetune.py | 7 +------
 src/together/types/finetune.py     | 4 ++--
 3 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/src/together/cli/api/finetune.py b/src/together/cli/api/finetune.py
index 74f6a483..bd9c8514 100644
--- a/src/together/cli/api/finetune.py
+++ b/src/together/cli/api/finetune.py
@@ -154,9 +154,6 @@ def create(
     )
 
     if lora:
-        log_warn_once(
-            "LoRA rank default has been changed from 8 to 64 as the maximum available for each model."
-        )
         if model_limits.lora_training is None:
             raise click.BadParameter(
                 f"LoRA fine-tuning is not supported for the model `{model}`"
@@ -167,6 +164,10 @@ def create(
             "batch_size": model_limits.lora_training.max_batch_size,
             "learning_rate": 1e-3,
         }
+        log_warn_once(
+            f"LoRA rank default has been changed to {default_values['lora_r']} as the max available for the model.\n"
+            f"Learning rate default for LoRA FT has been changed to {default_values['learning_rate']}."
+        )
         for arg in default_values:
             arg_source = ctx.get_parameter_source("arg")  # type: ignore[attr-defined]
             if arg_source == ParameterSource.DEFAULT:
diff --git a/src/together/resources/finetune.py b/src/together/resources/finetune.py
index b686f6c9..79596dd2 100644
--- a/src/together/resources/finetune.py
+++ b/src/together/resources/finetune.py
@@ -83,11 +83,6 @@ def createFinetuneRequest(
     if warmup_ratio > 1 or warmup_ratio < 0:
         raise ValueError("Warmup ratio should be between 0 and 1")
 
-    if train_on_inputs is None:
-        raise ValueError("train_on_inputs cannot be None")
-
-    train_on_inputs_bool = train_on_inputs if train_on_inputs != "auto" else None
-
     finetune_request = FinetuneRequest(
         model=model,
         training_file=training_file,
@@ -101,7 +96,7 @@ def createFinetuneRequest(
         training_type=training_type,
         suffix=suffix,
         wandb_key=wandb_api_key,
-        train_on_inputs=train_on_inputs_bool,
+        train_on_inputs=train_on_inputs,
     )
 
     return finetune_request
diff --git a/src/together/types/finetune.py b/src/together/types/finetune.py
index 7ca28486..f1fabb04 100644
--- a/src/together/types/finetune.py
+++ b/src/together/types/finetune.py
@@ -163,7 +163,7 @@ class FinetuneRequest(BaseModel):
     # weights & biases api key
     wandb_key: str | None = None
     training_type: FullTrainingType | LoRATrainingType | None = None
-    train_on_inputs: StrictBool | None = None
+    train_on_inputs: StrictBool | Literal["auto"] = "auto"
 
 
 class FinetuneResponse(BaseModel):
@@ -231,7 +231,7 @@ class FinetuneResponse(BaseModel):
     # training file metadata
     training_file_num_lines: int | None = Field(None, alias="TrainingFileNumLines")
     training_file_size: int | None = Field(None, alias="TrainingFileSize")
-    train_on_inputs: bool | None = None
+    train_on_inputs: StrictBool | Literal["auto"] | None = "auto"
 
     @field_validator("training_type")
     @classmethod

From b1f3a17de245c6b716cf455786a2ce9a1997674d Mon Sep 17 00:00:00 2001
From: Artem Chumachenko <artek.chumak@gmail.com>
Date: Thu, 14 Nov 2024 17:40:29 +0100
Subject: [PATCH 19/21] add system message

---
 tests/unit/test_files_checks.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tests/unit/test_files_checks.py b/tests/unit/test_files_checks.py
index e79f1986..65f59f61 100644
--- a/tests/unit/test_files_checks.py
+++ b/tests/unit/test_files_checks.py
@@ -58,6 +58,13 @@ def test_check_jsonl_valid_conversational_single_turn(tmp_path: Path):
                 {"role": "assistant", "content": "I am fine."},
             ]
         },
+        {
+            "messages": [
+                {"role": "system", "content": "You are a kind AI"},
+                {"role": "user", "content": "How are you?"},
+                {"role": "assistant", "content": "I am fine."},
+            ]
+        },
     ]
     with file.open("w") as f:
         f.write("\n".join(json.dumps(item) for item in content))
@@ -93,6 +100,15 @@ def test_check_jsonl_valid_conversational_multiple_turns(tmp_path: Path):
                 {"role": "assistant", "content": "It is cloudy with a chance of snow."},
             ]
         },
+        {
+            "messages": [
+                {"role": "system", "content": "You are a kind AI"},
+                {"role": "user", "content": "Who won the game last night?"},
+                {"role": "assistant", "content": "The home team won by two points."},
+                {"role": "user", "content": "What is the weather like in Amsterdam?"},
+                {"role": "assistant", "content": "It is cloudy with a chance of snow."},
+            ]
+        },
     ]
     with file.open("w") as f:
         f.write("\n".join(json.dumps(item) for item in content))

From 387a23ba94faf23c5adbe7c8d89d31a9790ead71 Mon Sep 17 00:00:00 2001
From: Artem Chumachenko <artek.chumak@gmail.com>
Date: Thu, 14 Nov 2024 17:42:09 +0100
Subject: [PATCH 20/21] version bump

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index a29fba8b..4c475deb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api"
 
 [tool.poetry]
 name = "together"
-version = "1.3.3"
+version = "1.3.4"
 authors = [
     "Together AI <support@together.ai>"
 ]

From 34d9177eff65987f38d07cf85ae739a569cf804c Mon Sep 17 00:00:00 2001
From: Artem Chumachenko <artek.chumak@gmail.com>
Date: Thu, 14 Nov 2024 18:03:48 +0100
Subject: [PATCH 21/21] Update src/together/cli/api/finetune.py

Co-authored-by: Max Ryabinin <mryabinin0@gmail.com>
---
 src/together/cli/api/finetune.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/together/cli/api/finetune.py b/src/together/cli/api/finetune.py
index bd9c8514..bd509e60 100644
--- a/src/together/cli/api/finetune.py
+++ b/src/together/cli/api/finetune.py
@@ -165,8 +165,8 @@ def create(
             "learning_rate": 1e-3,
         }
         log_warn_once(
-            f"LoRA rank default has been changed to {default_values['lora_r']} as the max available for the model.\n"
-            f"Learning rate default for LoRA FT has been changed to {default_values['learning_rate']}."
+            f"The default LoRA rank for {model} has been changed to {default_values['lora_r']} as the max available.\n"
+            f"Also, the default learning rate for LoRA fine-tuning has been changed to {default_values['learning_rate']}."
         )
         for arg in default_values:
             arg_source = ctx.get_parameter_source("arg")  # type: ignore[attr-defined]