diff --git a/pyproject.toml b/pyproject.toml index a29fba8b..4c475deb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api" [tool.poetry] name = "together" -version = "1.3.3" +version = "1.3.4" authors = [ "Together AI " ] diff --git a/src/together/cli/api/finetune.py b/src/together/cli/api/finetune.py index 3fdbd74b..bd509e60 100644 --- a/src/together/cli/api/finetune.py +++ b/src/together/cli/api/finetune.py @@ -11,8 +11,13 @@ from tabulate import tabulate from together import Together -from together.cli.api.utils import INT_WITH_MAX -from together.utils import finetune_price_to_dollars, log_warn, parse_timestamp +from together.cli.api.utils import BOOL_WITH_AUTO, INT_WITH_MAX +from together.utils import ( + finetune_price_to_dollars, + log_warn, + log_warn_once, + parse_timestamp, +) from together.types.finetune import DownloadCheckpointType, FinetuneTrainingLimits @@ -93,6 +98,13 @@ def fine_tuning(ctx: click.Context) -> None: default=False, help="Whether to skip the launch confirmation message", ) +@click.option( + "--train-on-inputs", + type=BOOL_WITH_AUTO, + default="auto", + help="Whether to mask the user messages in conversational data or prompts in instruction data. " + "`auto` will automatically determine whether to mask the inputs based on the data format.", +) def create( ctx: click.Context, training_file: str, @@ -112,6 +124,7 @@ def create( suffix: str, wandb_api_key: str, confirm: bool, + train_on_inputs: bool | Literal["auto"], ) -> None: """Start fine-tuning""" client: Together = ctx.obj @@ -133,6 +146,7 @@ def create( lora_trainable_modules=lora_trainable_modules, suffix=suffix, wandb_api_key=wandb_api_key, + train_on_inputs=train_on_inputs, ) model_limits: FinetuneTrainingLimits = client.fine_tuning.get_model_limits( @@ -150,6 +164,10 @@ def create( "batch_size": model_limits.lora_training.max_batch_size, "learning_rate": 1e-3, } + log_warn_once( + f"The default LoRA rank for {model} has been changed to {default_values['lora_r']} as the max available.\n" + f"Also, the default learning rate for LoRA fine-tuning has been changed to {default_values['learning_rate']}." + ) for arg in default_values: arg_source = ctx.get_parameter_source("arg") # type: ignore[attr-defined] if arg_source == ParameterSource.DEFAULT: @@ -186,22 +204,7 @@ def create( if confirm or click.confirm(_CONFIRMATION_MESSAGE, default=True, show_default=True): response = client.fine_tuning.create( - training_file=training_file, - model=model, - n_epochs=n_epochs, - validation_file=validation_file, - n_evals=n_evals, - n_checkpoints=n_checkpoints, - batch_size=batch_size, - learning_rate=learning_rate, - warmup_ratio=warmup_ratio, - lora=lora, - lora_r=lora_r, - lora_dropout=lora_dropout, - lora_alpha=lora_alpha, - lora_trainable_modules=lora_trainable_modules, - suffix=suffix, - wandb_api_key=wandb_api_key, + **training_args, verbose=True, ) diff --git a/src/together/cli/api/utils.py b/src/together/cli/api/utils.py index 3f85f380..08dfe492 100644 --- a/src/together/cli/api/utils.py +++ b/src/together/cli/api/utils.py @@ -27,4 +27,25 @@ def convert( ) +class BooleanWithAutoParamType(click.ParamType): + name = "boolean_or_auto" + + def convert( + self, value: str, param: click.Parameter | None, ctx: click.Context | None + ) -> bool | Literal["auto"] | None: + if value == "auto": + return "auto" + try: + return bool(value) + except ValueError: + self.fail( + _("{value!r} is not a valid {type}.").format( + value=value, type=self.name + ), + param, + ctx, + ) + + INT_WITH_MAX = AutoIntParamType() +BOOL_WITH_AUTO = BooleanWithAutoParamType() diff --git a/src/together/constants.py b/src/together/constants.py index b4c9cf3b..c64af326 100644 --- a/src/together/constants.py +++ b/src/together/constants.py @@ -1,3 +1,5 @@ +import enum + # Session constants TIMEOUT_SECS = 600 MAX_SESSION_LIFETIME_SECS = 180 @@ -29,3 +31,20 @@ # expected columns for Parquet files PARQUET_EXPECTED_COLUMNS = ["input_ids", "attention_mask", "labels"] + + +class DatasetFormat(enum.Enum): + """Dataset format enum.""" + + GENERAL = "general" + CONVERSATION = "conversation" + INSTRUCTION = "instruction" + + +JSONL_REQUIRED_COLUMNS_MAP = { + DatasetFormat.GENERAL: ["text"], + DatasetFormat.CONVERSATION: ["messages"], + DatasetFormat.INSTRUCTION: ["prompt", "completion"], +} +REQUIRED_COLUMNS_MESSAGE = ["role", "content"] +POSSIBLE_ROLES_CONVERSATION = ["system", "user", "assistant"] diff --git a/src/together/resources/finetune.py b/src/together/resources/finetune.py index 44d74f2b..79596dd2 100644 --- a/src/together/resources/finetune.py +++ b/src/together/resources/finetune.py @@ -43,6 +43,7 @@ def createFinetuneRequest( lora_trainable_modules: str | None = "all-linear", suffix: str | None = None, wandb_api_key: str | None = None, + train_on_inputs: bool | Literal["auto"] = "auto", ) -> FinetuneRequest: if batch_size == "max": log_warn_once( @@ -95,6 +96,7 @@ def createFinetuneRequest( training_type=training_type, suffix=suffix, wandb_key=wandb_api_key, + train_on_inputs=train_on_inputs, ) return finetune_request @@ -125,6 +127,7 @@ def create( wandb_api_key: str | None = None, verbose: bool = False, model_limits: FinetuneTrainingLimits | None = None, + train_on_inputs: bool | Literal["auto"] = "auto", ) -> FinetuneResponse: """ Method to initiate a fine-tuning job @@ -137,7 +140,7 @@ def create( n_evals (int, optional): Number of evaluation loops to run. Defaults to 0. n_checkpoints (int, optional): Number of checkpoints to save during fine-tuning. Defaults to 1. - batch_size (int, optional): Batch size for fine-tuning. Defaults to max. + batch_size (int or "max"): Batch size for fine-tuning. Defaults to max. learning_rate (float, optional): Learning rate multiplier to use for training Defaults to 0.00001. warmup_ratio (float, optional): Warmup ratio for learning rate scheduler. @@ -154,6 +157,12 @@ def create( Defaults to False. model_limits (FinetuneTrainingLimits, optional): Limits for the hyperparameters the model in Fine-tuning. Defaults to None. + train_on_inputs (bool or "auto"): Whether to mask the user messages in conversational data or prompts in instruction data. + "auto" will automatically determine whether to mask the inputs based on the data format. + For datasets with the "text" field (general format), inputs will not be masked. + For datasets with the "messages" field (conversational format) or "prompt" and "completion" fields + (Instruction format), inputs will be masked. + Defaults to "auto". Returns: FinetuneResponse: Object containing information about fine-tuning job. @@ -184,6 +193,7 @@ def create( lora_trainable_modules=lora_trainable_modules, suffix=suffix, wandb_api_key=wandb_api_key, + train_on_inputs=train_on_inputs, ) if verbose: @@ -436,6 +446,7 @@ async def create( wandb_api_key: str | None = None, verbose: bool = False, model_limits: FinetuneTrainingLimits | None = None, + train_on_inputs: bool | Literal["auto"] = "auto", ) -> FinetuneResponse: """ Async method to initiate a fine-tuning job @@ -465,6 +476,12 @@ async def create( Defaults to False. model_limits (FinetuneTrainingLimits, optional): Limits for the hyperparameters the model in Fine-tuning. Defaults to None. + train_on_inputs (bool or "auto"): Whether to mask the user messages in conversational data or prompts in instruction data. + "auto" will automatically determine whether to mask the inputs based on the data format. + For datasets with the "text" field (general format), inputs will not be masked. + For datasets with the "messages" field (conversational format) or "prompt" and "completion" fields + (Instruction format), inputs will be masked. + Defaults to "auto". Returns: FinetuneResponse: Object containing information about fine-tuning job. @@ -495,6 +512,7 @@ async def create( lora_trainable_modules=lora_trainable_modules, suffix=suffix, wandb_api_key=wandb_api_key, + train_on_inputs=train_on_inputs, ) if verbose: diff --git a/src/together/types/finetune.py b/src/together/types/finetune.py index 2f76c446..f1fabb04 100644 --- a/src/together/types/finetune.py +++ b/src/together/types/finetune.py @@ -3,7 +3,7 @@ from enum import Enum from typing import List, Literal -from pydantic import Field, validator, field_validator +from pydantic import StrictBool, Field, validator, field_validator from together.types.abstract import BaseModel from together.types.common import ( @@ -163,6 +163,7 @@ class FinetuneRequest(BaseModel): # weights & biases api key wandb_key: str | None = None training_type: FullTrainingType | LoRATrainingType | None = None + train_on_inputs: StrictBool | Literal["auto"] = "auto" class FinetuneResponse(BaseModel): @@ -230,6 +231,7 @@ class FinetuneResponse(BaseModel): # training file metadata training_file_num_lines: int | None = Field(None, alias="TrainingFileNumLines") training_file_size: int | None = Field(None, alias="TrainingFileSize") + train_on_inputs: StrictBool | Literal["auto"] | None = "auto" @field_validator("training_type") @classmethod diff --git a/src/together/utils/files.py b/src/together/utils/files.py index 875f4d84..570aa508 100644 --- a/src/together/utils/files.py +++ b/src/together/utils/files.py @@ -13,9 +13,28 @@ MIN_SAMPLES, NUM_BYTES_IN_GB, PARQUET_EXPECTED_COLUMNS, + JSONL_REQUIRED_COLUMNS_MAP, + REQUIRED_COLUMNS_MESSAGE, + POSSIBLE_ROLES_CONVERSATION, + DatasetFormat, ) +class InvalidFileFormatError(ValueError): + """Exception raised for invalid file formats during file checks.""" + + def __init__( + self, + message: str = "", + line_number: int | None = None, + error_source: str | None = None, + ) -> None: + super().__init__(message) + self.message = message + self.line_number = line_number + self.error_source = error_source + + def check_file( file: Path | str, ) -> Dict[str, Any]: @@ -31,7 +50,7 @@ def check_file( "line_type": None, "text_field": None, "key_value": None, - "min_samples": None, + "has_min_samples": None, "num_samples": None, "load_json": None, } @@ -88,43 +107,132 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: report_dict["is_check_passed"] = False return report_dict + dataset_format = None with file.open() as f: - # idx must be instantiated so decode errors (e.g. file is a tar) or empty files are caught idx = -1 try: for idx, line in enumerate(f): - json_line = json.loads(line) # each line in jsonlines should be a json + json_line = json.loads(line) if not isinstance(json_line, dict): - report_dict["line_type"] = False - report_dict["message"] = ( - f"Error parsing file. Invalid format on line {idx + 1} of the input file. " - 'Example of valid json: {"text": "my sample string"}. ' + raise InvalidFileFormatError( + message=( + f"Error parsing file. Invalid format on line {idx + 1} of the input file. " + 'Example of valid json: {"text": "my sample string"}. ' + ), + line_number=idx + 1, + error_source="line_type", ) - report_dict["is_check_passed"] = False + current_format = None + for possible_format in JSONL_REQUIRED_COLUMNS_MAP: + if all( + column in json_line + for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format] + ): + if current_format is None: + current_format = possible_format + elif current_format != possible_format: + raise InvalidFileFormatError( + message="Found multiple dataset formats in the input file. " + f"Got {current_format} and {possible_format} on line {idx + 1}.", + line_number=idx + 1, + error_source="format", + ) - if "text" not in json_line.keys(): - report_dict["text_field"] = False - report_dict["message"] = ( - f"Missing 'text' field was found on line {idx + 1} of the the input file. " - "Expected format: {'text': 'my sample string'}. " + if current_format is None: + raise InvalidFileFormatError( + message=( + f"Error parsing file. Could not detect a format for the line {idx + 1} with the columns:\n" + f"{json_line.keys()}" + ), + line_number=idx + 1, + error_source="format", ) - report_dict["is_check_passed"] = False - else: - # check to make sure the value of the "text" key is a string - if not isinstance(json_line["text"], str): - report_dict["key_value"] = False - report_dict["message"] = ( - f'Invalid value type for "text" key on line {idx + 1}. ' - f'Expected string. Found {type(json_line["text"])}.' + + if current_format == DatasetFormat.CONVERSATION: + message_column = JSONL_REQUIRED_COLUMNS_MAP[ + DatasetFormat.CONVERSATION + ][0] + if not isinstance(json_line[message_column], list): + raise InvalidFileFormatError( + message=f"Invalid format on line {idx + 1} of the input file. " + f"Expected a list of messages. Found {type(json_line[message_column])}", + line_number=idx + 1, + error_source="key_value", ) - report_dict["is_check_passed"] = False + for turn_id, turn in enumerate(json_line[message_column]): + if not isinstance(turn, dict): + raise InvalidFileFormatError( + message=f"Invalid format on line {idx + 1} of the input file. " + f"Expected a dictionary in the {turn_id + 1} turn. Found {type(turn)}", + line_number=idx + 1, + error_source="key_value", + ) + + previous_role = None + for turn in json_line[message_column]: + for column in REQUIRED_COLUMNS_MESSAGE: + if column not in turn: + raise InvalidFileFormatError( + message=f"Field `{column}` is missing for a turn `{turn}` on line {idx + 1} " + "of the the input file.", + line_number=idx + 1, + error_source="key_value", + ) + else: + if not isinstance(turn[column], str): + raise InvalidFileFormatError( + message=f"Invalid format on line {idx + 1} in the column {column} for turn `{turn}` " + f"of the input file. Expected string. Found {type(turn[column])}", + line_number=idx + 1, + error_source="text_field", + ) + role = turn["role"] + + if role not in POSSIBLE_ROLES_CONVERSATION: + raise InvalidFileFormatError( + message=f"Found invalid role `{role}` in the messages on the line {idx + 1}. " + f"Possible roles in the conversation are: {POSSIBLE_ROLES_CONVERSATION}", + line_number=idx + 1, + error_source="key_value", + ) + + if previous_role == role: + raise InvalidFileFormatError( + message=f"Invalid role turns on line {idx + 1} of the input file. " + "`user` and `assistant` roles must alternate user/assistant/user/assistant/...", + line_number=idx + 1, + error_source="key_value", + ) + + previous_role = role + + else: + for column in JSONL_REQUIRED_COLUMNS_MAP[current_format]: + if not isinstance(json_line[column], str): + raise InvalidFileFormatError( + message=f'Invalid value type for "{column}" key on line {idx + 1}. ' + f"Expected string. Found {type(json_line[column])}.", + line_number=idx + 1, + error_source="key_value", + ) + + if dataset_format is None: + dataset_format = current_format + elif current_format is not None: + if current_format != dataset_format: + raise InvalidFileFormatError( + message="All samples in the dataset must have the same dataset format. " + f"Got {dataset_format} for the first line and {current_format} " + f"for the line {idx + 1}.", + line_number=idx + 1, + error_source="format", + ) - # make sure this is outside the for idx, line in enumerate(f): for loop if idx + 1 < MIN_SAMPLES: - report_dict["min_samples"] = False + report_dict["has_min_samples"] = False report_dict["message"] = ( f"Processing {file} resulted in only {idx + 1} samples. " f"Our minimum is {MIN_SAMPLES} samples. " @@ -132,10 +240,19 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: report_dict["is_check_passed"] = False else: report_dict["num_samples"] = idx + 1 - report_dict["min_samples"] = True + report_dict["has_min_samples"] = True + report_dict["is_check_passed"] = True report_dict["load_json"] = True + except InvalidFileFormatError as e: + report_dict["load_json"] = False + report_dict["is_check_passed"] = False + report_dict["message"] = e.message + if e.line_number is not None: + report_dict["line_number"] = e.line_number + if e.error_source is not None: + report_dict[e.error_source] = False except ValueError: report_dict["load_json"] = False if idx < 0: @@ -190,7 +307,8 @@ def _check_parquet(file: Path) -> Dict[str, Any]: num_samples = len(table) if num_samples < MIN_SAMPLES: - report_dict["min_samples"] = ( + report_dict["has_min_samples"] = False + report_dict["message"] = ( f"Processing {file} resulted in only {num_samples} samples. " f"Our minimum is {MIN_SAMPLES} samples. " ) diff --git a/tests/unit/test_files_checks.py b/tests/unit/test_files_checks.py new file mode 100644 index 00000000..65f59f61 --- /dev/null +++ b/tests/unit/test_files_checks.py @@ -0,0 +1,281 @@ +import json +import pytest +from pathlib import Path + +from together.constants import MIN_SAMPLES +from together.utils.files import check_file + + +def test_check_jsonl_valid_general(tmp_path: Path): + # Create a valid JSONL file + file = tmp_path / "valid.jsonl" + content = [{"text": "Hello, world!"}, {"text": "How are you?"}] + with file.open("w") as f: + f.write("\n".join(json.dumps(item) for item in content)) + + report = check_file(file) + + assert report["is_check_passed"] + assert report["utf8"] + assert report["num_samples"] == len(content) + assert report["has_min_samples"] + + +def test_check_jsonl_valid_instruction(tmp_path: Path): + # Create a valid JSONL file with instruction format + file = tmp_path / "valid_instruction.jsonl" + content = [ + {"prompt": "Translate the following sentence.", "completion": "Hello, world!"}, + { + "prompt": "Summarize the text.", + "completion": "Weyland-Yutani Corporation creates advanced AI.", + }, + ] + with file.open("w") as f: + f.write("\n".join(json.dumps(item) for item in content)) + + report = check_file(file) + + assert report["is_check_passed"] + assert report["utf8"] + assert report["num_samples"] == len(content) + assert report["has_min_samples"] + + +def test_check_jsonl_valid_conversational_single_turn(tmp_path: Path): + # Create a valid JSONL file with conversational format and 1 user-assistant turn pair + file = tmp_path / "valid_conversational_single_turn.jsonl" + content = [ + { + "messages": [ + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there!"}, + ] + }, + { + "messages": [ + {"role": "user", "content": "How are you?"}, + {"role": "assistant", "content": "I am fine."}, + ] + }, + { + "messages": [ + {"role": "system", "content": "You are a kind AI"}, + {"role": "user", "content": "How are you?"}, + {"role": "assistant", "content": "I am fine."}, + ] + }, + ] + with file.open("w") as f: + f.write("\n".join(json.dumps(item) for item in content)) + + report = check_file(file) + + assert report["is_check_passed"] + assert report["utf8"] + assert report["num_samples"] == len(content) + assert report["has_min_samples"] + + +def test_check_jsonl_valid_conversational_multiple_turns(tmp_path: Path): + # Create a valid JSONL file with conversational format and multiple user-assistant turn pairs + file = tmp_path / "valid_conversational_multiple_turns.jsonl" + content = [ + { + "messages": [ + {"role": "user", "content": "Is it going to rain today?"}, + { + "role": "assistant", + "content": "Yes, expect showers in the afternoon.", + }, + {"role": "user", "content": "What is the weather like in Tokyo?"}, + {"role": "assistant", "content": "It is sunny with a chance of rain."}, + ] + }, + { + "messages": [ + {"role": "user", "content": "Who won the game last night?"}, + {"role": "assistant", "content": "The home team won by two points."}, + {"role": "user", "content": "What is the weather like in Amsterdam?"}, + {"role": "assistant", "content": "It is cloudy with a chance of snow."}, + ] + }, + { + "messages": [ + {"role": "system", "content": "You are a kind AI"}, + {"role": "user", "content": "Who won the game last night?"}, + {"role": "assistant", "content": "The home team won by two points."}, + {"role": "user", "content": "What is the weather like in Amsterdam?"}, + {"role": "assistant", "content": "It is cloudy with a chance of snow."}, + ] + }, + ] + with file.open("w") as f: + f.write("\n".join(json.dumps(item) for item in content)) + + report = check_file(file) + + assert report["is_check_passed"] + assert report["utf8"] + assert report["num_samples"] == len(content) + assert report["has_min_samples"] + + +def test_check_jsonl_empty_file(tmp_path: Path): + # Create an empty JSONL file + file = tmp_path / "empty.jsonl" + file.touch() + + report = check_file(file) + + assert not report["is_check_passed"] + assert report["message"] == "File is empty" + assert report["file_size"] == 0 + + +def test_check_jsonl_non_utf8(tmp_path: Path): + # Create a non-UTF-8 encoded JSONL file + file = tmp_path / "non_utf8.jsonl" + file.write_bytes(b"\xff\xfe\xfd") + + report = check_file(file) + + assert not report["is_check_passed"] + assert not report["utf8"] + assert "File is not UTF-8 encoded." in report["message"] + + +def test_check_jsonl_invalid_json(tmp_path: Path): + # Create a JSONL file with invalid JSON + file = tmp_path / "invalid_json.jsonl" + content = [{"text": "Hello, world!"}, "Invalid JSON Line"] + with file.open("w") as f: + f.write("\n".join(json.dumps(item) for item in content)) + + report = check_file(file) + + assert not report["is_check_passed"] + assert "Error parsing file." in report["message"] + + +def test_check_jsonl_missing_required_field(tmp_path: Path): + # Create a JSONL file missing a required field + file = tmp_path / "missing_field.jsonl" + content = [ + {"prompt": "Translate the following sentence.", "completion": "Hello, world!"}, + {"prompt": "Summarize the text."}, + ] + with file.open("w") as f: + f.write("\n".join(json.dumps(item) for item in content)) + + report = check_file(file) + + assert not report["is_check_passed"] + assert ( + "Error parsing file. Could not detect a format for the line 2" + in report["message"] + ) + + +def test_check_jsonl_inconsistent_dataset_format(tmp_path: Path): + # Create a JSONL file with inconsistent dataset formats + file = tmp_path / "inconsistent_format.jsonl" + content = [ + {"messages": [{"role": "user", "content": "Hi"}]}, + {"text": "How are you?"}, # Missing 'messages' + ] + with file.open("w") as f: + f.write("\n".join(json.dumps(item) for item in content)) + + report = check_file(file) + + assert not report["is_check_passed"] + assert ( + "All samples in the dataset must have the same dataset format" + in report["message"] + ) + + +def test_check_jsonl_invalid_role(tmp_path: Path): + # Create a JSONL file with an invalid role + file = tmp_path / "invalid_role.jsonl" + content = [{"messages": [{"role": "invalid_role", "content": "Hi"}]}] + with file.open("w") as f: + f.write("\n".join(json.dumps(item) for item in content)) + + report = check_file(file) + + assert not report["is_check_passed"] + assert "Found invalid role `invalid_role`" in report["message"] + + +def test_check_jsonl_non_alternating_roles(tmp_path: Path): + # Create a JSONL file with non-alternating user/assistant roles + file = tmp_path / "non_alternating_roles.jsonl" + content = [ + { + "messages": [ + {"role": "user", "content": "Hi"}, + {"role": "user", "content": "Hello again"}, + ] + } + ] + with file.open("w") as f: + f.write("\n".join(json.dumps(item) for item in content)) + + report = check_file(file) + + assert not report["is_check_passed"] + assert "Invalid role turns" in report["message"] + + +def test_check_jsonl_invalid_value_type(tmp_path: Path): + # Create a JSONL file with an invalid value type + file = tmp_path / "invalid_value_type.jsonl" + content = [{"text": 123}] + with file.open("w") as f: + f.write("\n".join(json.dumps(item) for item in content)) + + report = check_file(file) + assert not report["is_check_passed"] + assert "Expected string" in report["message"] + + +def test_check_jsonl_missing_field_in_conversation(tmp_path: Path): + file = tmp_path / "missing_field_in_conversation.jsonl" + content = [ + { + "messages": [ + {"role": "user", "content": "Hi"}, + {"role": "assistant"}, + ] + } + ] + with file.open("w") as f: + f.write("\n".join(json.dumps(item) for item in content)) + + report = check_file(file) + assert not report["is_check_passed"] + assert "Field `content` is missing for a turn" in report["message"] + + +def test_check_jsonl_wrong_turn_type(tmp_path: Path): + file = tmp_path / "wrong_turn_type.jsonl" + content = [ + { + "messages": [ + "Hi!", + {"role": "user", "content": "Hi"}, + {"role": "assistant"}, + ] + } + ] + with file.open("w") as f: + f.write("\n".join(json.dumps(item) for item in content)) + + report = check_file(file) + assert not report["is_check_passed"] + assert ( + "Invalid format on line 1 of the input file. Expected a dictionary" + in report["message"] + )