diff --git a/CHANGELOG.md b/CHANGELOG.md index a36890909..3fa33d735 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) - Add support for model profiles by @rawwar in ([#358](https://github.com/opensearch-project/opensearch-py-ml/pull/358)) - Support for security default admin credential changes in 2.12.0 in ([#365](https://github.com/opensearch-project/opensearch-py-ml/pull/365)) - adding cross encoder models in the pre-trained traced list ([#378](https://github.com/opensearch-project/opensearch-py-ml/pull/378)) +- Add scripts to trace Question Answering Models by @faradawn ([#349](https://github.com/opensearch-project/opensearch-py-ml/pull/349)) ### Changed diff --git a/opensearch_py_ml/ml_models/__init__.py b/opensearch_py_ml/ml_models/__init__.py index 3ec96ebd5..3a05b0a71 100644 --- a/opensearch_py_ml/ml_models/__init__.py +++ b/opensearch_py_ml/ml_models/__init__.py @@ -6,6 +6,7 @@ # GitHub history for details. from .metrics_correlation.mcorr import MCorr +from .question_answering_model import QuestionAnsweringModel from .sentencetransformermodel import SentenceTransformerModel -__all__ = ["SentenceTransformerModel", "MCorr"] +__all__ = ["SentenceTransformerModel", "QuestionAnsweringModel", "MCorr"] diff --git a/opensearch_py_ml/ml_models/question_answering_model.py b/opensearch_py_ml/ml_models/question_answering_model.py new file mode 100644 index 000000000..26386350f --- /dev/null +++ b/opensearch_py_ml/ml_models/question_answering_model.py @@ -0,0 +1,502 @@ +# SPDX-License-Identifier: Apache-2.0 +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. +# Any modifications Copyright OpenSearch Contributors. See +# GitHub history for details. + +# for generating config +import json +import os +from pathlib import Path +from zipfile import ZipFile + +import requests + +# for torch +import torch +import transformers +from transformers import AutoModelForQuestionAnswering, AutoTokenizer + +from opensearch_py_ml.ml_commons.ml_common_utils import ( + _generate_model_content_hash_value, +) + +LICENSE_URL = "https://github.com/opensearch-project/opensearch-py-ml/raw/main/LICENSE" + + +class QuestionAnsweringModel: + """ + Class for tracing the QuestionAnswering model. + """ + + # distilbert-base-cased-distilled-squad + DEFAULT_MODEL_ID = "distilbert-base-cased-distilled-squad" + SYNTHETIC_QUERY_FOLDER = "synthetic_queries" + + def __init__( + self, + model_id: str = DEFAULT_MODEL_ID, + folder_path: str = None, + overwrite: bool = False, + ) -> None: + """ + Initiate a question answering model class object. The model id will be used to download + the pretrained model from Hugging Face and serves as the default name for model files, and the folder_path + will be the default location to store files generated by the following functions + + :param model_id: Optional, the huggingface model id to download the model, + default model id: 'distilbert-base-cased-distilled-squad' + :type model_id: string + :param folder_path: Optional, the path of the folder to save output files, such as queries, pre-trained model, + after-trained custom model and configuration files. If None, default as "question_answering_model_files/" under the current + working directory + :type folder_path: string + :param overwrite: Optional, choose to overwrite the folder at folder path. Default as false.
When training + different question answering models, it's recommended to give designated folder path every time. + Users can choose to overwrite = True to overwrite previous runs + :type overwrite: bool + :return: no return value expected + :rtype: None + """ + default_folder_path = os.path.join( + os.getcwd(), "question_answering_model_files" + ) + + if folder_path is None: + self.folder_path = default_folder_path + else: + self.folder_path = folder_path + + # Check if self.folder_path exists + if os.path.exists(self.folder_path) and not overwrite: + print( + "To prevent overwriting, please enter a different folder path or delete the folder or enable " + "overwrite = True " + ) + raise Exception( + str("The default folder path already exists at : " + self.folder_path) + ) + + self.model_id = model_id + self.torch_script_zip_file_path = None + self.onnx_zip_file_path = None + + def _add_apache_license_to_model_zip_file(self, model_zip_file_path: str): + """ + Add Apache-2.0 license file to the model zip file at model_zip_file_path + + :param model_zip_file_path: + Path to the model zip file + :type model_zip_file_path: string + :return: no return value expected + :rtype: None + """ + r = requests.get(LICENSE_URL) + assert r.status_code == 200, "Failed to add license file to the model zip file" + + with ZipFile(str(model_zip_file_path), "a") as zipObj: + zipObj.writestr("LICENSE", r.content) + + def save_as_pt( + self, + sentences: [str] = ["today is sunny"], + model_id="distilbert-base-cased-distilled-squad", + model_name: str = None, + save_json_folder_path: str = None, + model_output_path: str = None, + zip_file_name: str = None, + add_apache_license: bool = False, + ) -> str: + """ + Download the model directly from huggingface, convert model to torch script format, + zip the model file and its tokenizer.json file to prepare to upload to the OpenSearch cluster + + :param sentences: + Optional, for example sentences = ['today is sunny'] + :type sentences: List of string [str] + :param model_id: + Optional, question answering model id to download model from Huggingface. + default model_id = "distilbert-base-cased-distilled-squad" + :type model_id: string + :param model_name: + Optional, model name to name the model file, e.g, "sample_model.pt". If None, default takes the + model_id and add the extension with ".pt" + :type model_name: string + :param save_json_folder_path: + Optional, path to save model json file, e.g, "home/save_pre_trained_model_json/"). If None, default as + default_folder_path from the constructor + :type save_json_folder_path: string + :param model_output_path: + Optional, path to save traced model zip file. If None, default as + default_folder_path from the constructor + :type model_output_path: string + :param zip_file_name: + Optional, file name for zip file. e.g, "sample_model.zip". If None, default takes the model_id + and add the extension with ".zip" + :type zip_file_name: string + :param add_apache_license: + Optional, whether to add a Apache-2.0 license file to model zip file + :type add_apache_license: string + :return: model zip file path. 
The file path where the zip file is being saved + :rtype: string + """ + + tokenizer = AutoTokenizer.from_pretrained(model_id) + model = AutoModelForQuestionAnswering.from_pretrained(model_id) + + if model_name is None: + model_name = str(model_id.split("/")[-1] + ".pt") + + model_path = os.path.join(self.folder_path, model_name) + + if save_json_folder_path is None: + save_json_folder_path = self.folder_path + + if model_output_path is None: + model_output_path = self.folder_path + + if zip_file_name is None: + zip_file_name = str(model_id.split("/")[-1] + ".zip") + zip_file_path = os.path.join(model_output_path, zip_file_name) + + tokenizer.save_pretrained(save_json_folder_path) + tokenizer_file_path = os.path.join(save_json_folder_path, "tokenizer.json") + # Open the tokenizer.json and replace the truncation field + with open(tokenizer_file_path) as user_file: + parsed_json = json.load(user_file) + + if "truncation" not in parsed_json or parsed_json["truncation"] is None: + parsed_json["truncation"] = { + "direction": "Right", + "max_length": tokenizer.model_max_length, + "strategy": "LongestFirst", + "stride": 0, + } + + tokenizer_file_path = os.path.join(save_json_folder_path, "tokenizer.json") + with open(tokenizer_file_path, "w") as file: + json.dump(parsed_json, file, indent=2) + + # conversion to the .pt format needs to run on cpu, + # so set the device to cpu, build input_ids and attention_mask on cpu, and save as .pt format + device = torch.device("cpu") + cpu_model = model.to(device) + features = tokenizer( + sentences, return_tensors="pt", padding=True, truncation=True + ).to(device) + + compiled_model = torch.jit.trace( + cpu_model, (features["input_ids"], features["attention_mask"]), strict=False + ) + torch.jit.save(compiled_model, model_path) + print("Traced torchscript model is saved to ", model_path) + + # zip model file along with tokenizer.json (and license file) as output + with ZipFile(str(zip_file_path), "w") as zipObj: + zipObj.write( + model_path, + arcname=str(model_name), + ) + zipObj.write( + os.path.join(save_json_folder_path, "tokenizer.json"), + arcname="tokenizer.json", + ) + if add_apache_license: + self._add_apache_license_to_model_zip_file(zip_file_path) + + self.torch_script_zip_file_path = zip_file_path + print("zip file is saved to ", zip_file_path, "\n") + return zip_file_path + + def save_as_onnx( + self, + model_id="distilbert-base-cased-distilled-squad", + model_name: str = None, + save_json_folder_path: str = None, + model_output_path: str = None, + zip_file_name: str = None, + add_apache_license: bool = False, + ) -> str: + """ + Download question answering model directly from huggingface, convert model to onnx format, + zip the model file and its tokenizer.json file to prepare to upload to the OpenSearch cluster + + :param model_id: + Optional, question answering model id to download model from Huggingface. + default model_id = "distilbert-base-cased-distilled-squad" + :type model_id: string + :param model_name: + Optional, model name to name the model file, e.g, "sample_model.onnx". If None, default takes the + model_id and adds the extension with ".onnx" + :type model_name: string + :param save_json_folder_path: + Optional, path to save model json file, e.g, "home/save_pre_trained_model_json/". If None, default as + default_folder_path from the constructor + :type save_json_folder_path: string + :param model_output_path: + Optional, path to save traced model zip file.
If None, default as + default_folder_path from the constructor + :type model_output_path: string + :param zip_file_name: + Optional, file name for zip file. e.g, "sample_model.zip". If None, default takes the model_id + and add the extension with ".zip" + :type zip_file_name: string + :param add_apache_license: + Optional, whether to add a Apache-2.0 license file to model zip file + :type add_apache_license: string + :return: model zip file path. The file path where the zip file is being saved + :rtype: string + """ + + tokenizer = AutoTokenizer.from_pretrained(model_id) + model = AutoModelForQuestionAnswering.from_pretrained(model_id) + + if model_name is None: + model_name = str(model_id.split("/")[-1] + ".onnx") + + model_path = os.path.join(self.folder_path, model_name) + + if save_json_folder_path is None: + save_json_folder_path = self.folder_path + + if model_output_path is None: + model_output_path = self.folder_path + + if zip_file_name is None: + zip_file_name = str(model_id.split("/")[-1] + ".zip") + zip_file_path = os.path.join(model_output_path, zip_file_name) + + # save tokenizer.json in save_json_folder_name + tokenizer.save_pretrained(save_json_folder_path) + + # Find the tokenizer.json file path in cache: /Users/faradawn/.cache/huggingface/hub/models/... + tokenizer_file_path = os.path.join(save_json_folder_path, "tokenizer.json") + + # Open the tokenizer.json and replace the truncation field + with open(tokenizer_file_path) as user_file: + parsed_json = json.load(user_file) + + if "truncation" not in parsed_json or parsed_json["truncation"] is None: + parsed_json["truncation"] = { + "direction": "Right", + "max_length": tokenizer.model_max_length, + "strategy": "LongestFirst", + "stride": 0, + } + + # Save tokenizer + tokenizer_file_path = os.path.join(save_json_folder_path, "tokenizer.json") + with open(tokenizer_file_path, "w") as file: + json.dump(parsed_json, file, indent=2) + + # load config + model_kind, model_onnx_config = ( + transformers.onnx.FeaturesManager.check_supported_model_or_raise( + model, feature="question-answering" + ) + ) + onnx_config = model_onnx_config(model.config) + + # export + onnx_inputs, onnx_outputs = transformers.onnx.export( + preprocessor=tokenizer, + model=model, + config=onnx_config, + opset=13, + output=Path(model_path), + ) + + print("Traced onnx model is saved to ", model_path) + + # zip model file along with tokenizer.json (and license file) as output + with ZipFile(str(zip_file_path), "w") as zipObj: + zipObj.write( + model_path, + arcname=str(model_name), + ) + zipObj.write( + os.path.join(save_json_folder_path, "tokenizer.json"), + arcname="tokenizer.json", + ) + if add_apache_license: + self._add_apache_license_to_model_zip_file(zip_file_path) + + self.onnx_zip_file_path = zip_file_path + print("zip file is saved to ", zip_file_path, "\n") + return zip_file_path + + def make_model_config_json( + self, + model_name: str = None, + version_number: str = 1, + model_format: str = "TORCH_SCRIPT", + model_zip_file_path: str = None, + embedding_dimension: int = None, + pooling_mode: str = None, + normalize_result: bool = None, + description: str = None, + all_config: str = None, + model_type: str = None, + verbose: bool = False, + ) -> str: + """ + Parse from config.json file of pre-trained hugging-face model to generate a ml-commons_model_config.json file. + If all required fields are given by users, use the given parameters and will skip reading the config.json + + :param model_name: + Optional, The name of the model. 
If None, default is the model id, for example, + 'distilbert-base-cased-distilled-squad' + :type model_name: string + :param model_format: + Optional, the format of the model. Default is "TORCH_SCRIPT". + :type model_format: string + :param model_zip_file_path: + Optional, path to the model zip file. Default is the zip file path used in save_as_pt or save_as_onnx + depending on model_format. This zip file is used to compute model_content_size_in_bytes and + model_content_hash_value. + :type model_zip_file_path: string + :param version_number: + Optional, The version number of the model. Default is 1 + :type version_number: string + :param embedding_dimension: Optional, the embedding dimension of the model. If None, get embedding_dimension + from the pre-trained hugging-face model object. + :type embedding_dimension: int + :param pooling_mode: Optional, the pooling mode of the model. If None, get pooling_mode + from the pre-trained hugging-face model object. + :type pooling_mode: string + :param normalize_result: Optional, whether to normalize the result of the model. If None, check from the pre-trained + hugging-face model object. + :type normalize_result: bool + :param description: Optional, the description of the model. If None, use the default description + for question answering models. + :type description: str + :param all_config: + Optional, the all_config of the model. If None, parse all contents from the config file of pre-trained + hugging-face model + :type all_config: dict + :param model_type: + Optional, the model_type of the model. If None, parse model_type from the config file of pre-trained + hugging-face model + :type model_type: string + :param verbose: + Optional, whether to print more logs. Default as false + :type verbose: bool + :return: model config file path. The file path where the model config file is being saved + :rtype: string + """ + folder_path = self.folder_path + config_json_file_path = os.path.join(folder_path, "config.json") + if model_name is None: + model_name = self.model_id + + # download and save the model locally so its config.json is available; user-provided model_type/embedding_dimension/pooling_mode values are not overwritten below. + model = AutoModelForQuestionAnswering.from_pretrained(self.model_id) + model.save_pretrained(self.folder_path) + + # fill the empty fields + if ( + model_type is None + or embedding_dimension is None + or pooling_mode is None + or normalize_result is None + ): + try: + if embedding_dimension is None: + try: + embedding_dimension = model.config.dim + except Exception: + embedding_dimension = 768 + + if model_type is None: + model_type = "distilbert" + if pooling_mode is None: + pooling_mode = "CLS" + if normalize_result is None: + normalize_result = False + + except Exception as e: + raise Exception( + f"Raised exception while getting model data from pre-trained hugging-face model object: {e}" + ) + + # fill the description + if description is None: + description = "This is a question-answering model: it provides answers to a question and context." + + # dump the config.json file + if all_config is None: + if not os.path.exists(config_json_file_path): + raise Exception( + str( + "Cannot find config.json in" + + config_json_file_path + + ". Please check the config.json file in the path." + ) + ) + try: + with open(config_json_file_path) as f: + if verbose: + print("reading config file from: " + config_json_file_path) + config_content = json.load(f) + if all_config is None: + all_config = config_content + except IOError: + print( + "Cannot open config.json file at ", + config_json_file_path, + ". 
Please check the config.json ", + "file in the path.", + ) + + model_config_content = { + "name": model_name, + "version": version_number, + "description": description, + "model_format": model_format, + "model_task_type": "QUESTION_ANSWERING", + "model_config": { + "model_type": model_type, + "embedding_dimension": embedding_dimension, + "framework_type": "sentence_transformers", + "pooling_mode": pooling_mode, + "normalize_result": normalize_result, + "all_config": json.dumps(all_config), + }, + } + + # get model size and hash value + if model_zip_file_path is None: + model_zip_file_path = ( + self.torch_script_zip_file_path + if model_format == "TORCH_SCRIPT" + else self.onnx_zip_file_path + ) + + if model_zip_file_path is None: + print( + "The model configuration JSON file currently lacks the 'model_content_size_in_bytes' and 'model_content_hash_value' fields. You can include these fields by specifying the 'model_zip_file_path' parameter. Failure to do so may result in the model registration process encountering issues." + ) + else: + model_config_content["model_content_size_in_bytes"] = os.stat( + model_zip_file_path + ).st_size + model_config_content["model_content_hash_value"] = ( + _generate_model_content_hash_value(model_zip_file_path) + ) + + if verbose: + print("generating ml-commons_model_config.json file...\n") + print(json.dumps(model_config_content, indent=4)) + + model_config_file_path = os.path.join( + folder_path, "ml-commons_model_config.json" + ) + os.makedirs(os.path.dirname(model_config_file_path), exist_ok=True) + with open(model_config_file_path, "w") as file: + json.dump(model_config_content, file, indent=4) + print( + "ml-commons_model_config.json file is saved at : ", model_config_file_path + ) + + return model_config_file_path diff --git a/requirements-dev.txt b/requirements-dev.txt index e7b62bcf0..10d6cc96f 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -14,6 +14,7 @@ transformers>=4.36.0,<5 deprecated>=1.2.14,<2 mdutils>=1.6.0,<2 pillow>10.0.0,<11 +onnxruntime # # Testing diff --git a/tests/ml_models/test_question_answering_pytest.py b/tests/ml_models/test_question_answering_pytest.py new file mode 100644 index 000000000..08b8c1d69 --- /dev/null +++ b/tests/ml_models/test_question_answering_pytest.py @@ -0,0 +1,530 @@ +# SPDX-License-Identifier: Apache-2.0 +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. +# Any modifications Copyright OpenSearch Contributors. See +# GitHub history for details. +# How to run: pytest tests/ml_models/test_question_answering_pytest.py + +import json +import os +import shutil +from zipfile import ZipFile + +import pytest + +from opensearch_py_ml.ml_models import QuestionAnsweringModel + +# default parameters +default_model_id = "distilbert-base-cased-distilled-squad" +default_model_description = ( + "This is a question-answering model: it provides answers to a question and context." 
+) + +TEST_FOLDER = os.path.join( + os.path.dirname(os.path.abspath("__file__")), "tests", "test_model_files" +) +TESTDATA_FILENAME = os.path.join( + os.path.dirname(os.path.abspath("__file__")), "tests", "sample_zip.zip" +) +TESTDATA_UNZIP_FOLDER = os.path.join( + os.path.dirname(os.path.abspath("__file__")), "tests", "sample_zip" +) + + +def clean_test_folder(TEST_FOLDER): + if os.path.exists(TEST_FOLDER): + for files in os.listdir(TEST_FOLDER): + sub_path = os.path.join(TEST_FOLDER, files) + if os.path.isfile(sub_path): + os.remove(sub_path) + else: + try: + shutil.rmtree(sub_path) + except OSError as err: + print( + "Fail to delete files, please delete all files in " + + str(TEST_FOLDER) + + " " + + str(err) + ) + + shutil.rmtree(TEST_FOLDER) + + +def compare_model_config( + model_config_path, + model_id, + model_format, + expected_model_description=None, + expected_model_config_data=None, +): + try: + with open(model_config_path) as json_file: + model_config_data = json.load(json_file) + except Exception as exec: + assert ( + False + ), f"Creating model config file for tracing in {model_format} raised an exception {exec}" + + assert ( + "name" in model_config_data and model_config_data["name"] == model_id + ), f"Missing or Wrong model name in {model_format} model config file" + + assert ( + "model_format" in model_config_data + and model_config_data["model_format"] == model_format + ), f"Missing or Wrong model_format in {model_format} model config file" + + if expected_model_description is not None: + assert ( + "description" in model_config_data + and model_config_data["description"] == expected_model_description + ), f"Missing or Wrong model description in {model_format} model config file'" + + if expected_model_config_data is not None: + assert ( + "model_config" in model_config_data + ), f"Missing 'model_config' in {model_format} model config file" + + if expected_model_config_data is not None: + for k, v in expected_model_config_data.items(): + assert ( + k in model_config_data["model_config"] + and model_config_data["model_config"][k] == v + ) or ( + k not in model_config_data["model_config"] + and k == "normalize_result" + and not v + ) + + assert ( + "model_content_size_in_bytes" in model_config_data + ), f"Missing 'model_content_size_in_bytes' in {model_format} model config file" + + assert ( + "model_content_hash_value" in model_config_data + ), f"Missing 'model_content_hash_value' in {model_format} model config file" + + +def compare_model_zip_file(zip_file_path, expected_filenames, model_format): + with ZipFile(zip_file_path, "r") as f: + filenames = set(f.namelist()) + assert ( + filenames == expected_filenames + ), f"The content in the {model_format} model zip file does not match the expected content: {filenames} != {expected_filenames}" + + +# New +clean_test_folder(TEST_FOLDER) +test_model = QuestionAnsweringModel(folder_path=TEST_FOLDER) + + +def test_check_attribute(): + test_model = QuestionAnsweringModel(folder_path=TEST_FOLDER) + try: + check_attribute = getattr(test_model, "model_id", "folder_path") + except AttributeError: + check_attribute = False + assert check_attribute + + assert test_model.folder_path == TEST_FOLDER + assert test_model.model_id == default_model_id + + default_folder = os.path.join(os.getcwd(), "question_answering_model_files") + + clean_test_folder(default_folder) + test_model0 = QuestionAnsweringModel() + assert test_model0.folder_path == default_folder + clean_test_folder(default_folder) + + clean_test_folder(TEST_FOLDER) + our_model_id = 
"distilbert-base-cased-distilled-squad" + test_model1 = QuestionAnsweringModel(folder_path=TEST_FOLDER, model_id=our_model_id) + assert test_model1.model_id == our_model_id + + +def test_folder_path(): + with pytest.raises(Exception) as exc_info: + test_non_empty_path = os.path.join( + os.path.dirname(os.path.abspath("__file__")), "tests" + ) + QuestionAnsweringModel(folder_path=test_non_empty_path, overwrite=False) + assert exc_info.type is Exception + assert "The default folder path already exists" in exc_info.value.args[0] + + +# New tests for save_as_pt and save_as_onnx + +test_cases = [ + {"question": "Who was Jim Henson?", "context": "Jim Henson was a nice puppet"}, + { + "question": "Where do I live?", + "context": "My name is Sarah and I live in London", + }, + { + "question": "What's my name?", + "context": "My name is Clara and I live in Berkeley.", + }, + { + "question": "Which name is also used to describe the Amazon rainforest in English?", + "context": "The Amazon rainforest (Portuguese: Floresta Amazônica or Amazônia; Spanish: Selva Amazónica, Amazonía or usually Amazonia; French: Forêt amazonienne; Dutch: Amazoneregenwoud), also known in English as Amazonia or the Amazon Jungle, is a moist broadleaf forest that covers most of the Amazon basin of South America. This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres (2,100,000 sq mi) are covered by the rainforest. This region includes territory belonging to nine nations. The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by Peru with 13%, Colombia with 10%, and with minor amounts in Venezuela, Ecuador, Bolivia, Guyana, Suriname and French Guiana. States or departments in four nations contain 'Amazonas' in their names. 
The Amazon represents over half of the planet's remaining rainforests, and comprises the largest and most biodiverse tract of tropical rainforest in the world, with an estimated 390 billion individual trees divided into 16,000 species.", + }, +] + + +def get_official_answer(test_cases): + # Obtain pytorch's official model + import torch + from transformers import AutoModelForQuestionAnswering, AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad") + official_model = AutoModelForQuestionAnswering.from_pretrained( + "distilbert-base-cased-distilled-squad" + ) + + results = [] + + for case in test_cases: + question, context = case["question"], case["context"] + inputs = tokenizer(question, context, return_tensors="pt") + with torch.no_grad(): + outputs = official_model(**inputs) + answer_start_index = torch.argmax(outputs.start_logits, dim=-1).item() + answer_end_index = torch.argmax(outputs.end_logits, dim=-1).item() + results.append([answer_start_index, answer_end_index]) + + return results + + +def get_pt_answer(test_cases, folder_path, model_id): + import torch + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained(model_id) + traced_model = torch.jit.load(f"{folder_path}/{model_id}.pt") + + results = [] + + for case in test_cases: + question, context = case["question"], case["context"] + inputs = tokenizer(question, context, return_tensors="pt") + + with torch.no_grad(): + outputs = traced_model(**inputs) + answer_start_index = torch.argmax(outputs["start_logits"], dim=-1).item() + answer_end_index = torch.argmax(outputs["end_logits"], dim=-1).item() + results.append([answer_start_index, answer_end_index]) + + return results + + +def get_onnx_answer(test_cases, folder_path, model_id): + import numpy as np + from onnxruntime import InferenceSession + from transformers import AutoTokenizer + + session = InferenceSession(f"{folder_path}/{model_id}.onnx") + tokenizer = AutoTokenizer.from_pretrained(model_id) + + results = [] + + for case in test_cases: + question, context = case["question"], case["context"] + inputs = tokenizer(question, context, return_tensors="pt") + + inputs = tokenizer(question, context, return_tensors="np") + outputs = session.run( + output_names=["start_logits", "end_logits"], input_feed=dict(inputs) + ) + + answer_start_index = np.argmax(outputs[0], axis=-1).item() + answer_end_index = np.argmax(outputs[1], axis=-1).item() + results.append([answer_start_index, answer_end_index]) + + return results + + +def test_pt_answer(): + test_model = QuestionAnsweringModel(folder_path=TEST_FOLDER, overwrite=True) + test_model.save_as_pt(default_model_id) + pt_results = get_pt_answer(test_cases, TEST_FOLDER, default_model_id) + official_results = get_official_answer(test_cases) + for i in range(len(pt_results)): + assert ( + pt_results[i] == official_results[i] + ), f"Failed at index {i}: pt_results[{i}] ({pt_results[i]}) != official_results[{i}] ({official_results[i]})" + + clean_test_folder(TEST_FOLDER) + clean_test_folder(TESTDATA_UNZIP_FOLDER) + + +def test_onnx_answer(): + test_model = QuestionAnsweringModel(folder_path=TEST_FOLDER, overwrite=True) + test_model.save_as_onnx(default_model_id) + onnx_results = get_onnx_answer(test_cases, TEST_FOLDER, default_model_id) + official_results = get_official_answer(test_cases) + for i in range(len(onnx_results)): + assert ( + onnx_results[i] == official_results[i] + ), f"Failed at index {i}: onnx_results[{i}] ({onnx_results[i]}) != official_results[{i}] 
({official_results[i]})" + + clean_test_folder(TEST_FOLDER) + clean_test_folder(TESTDATA_UNZIP_FOLDER) + + +def test_make_model_config_json_for_torch_script(): + model_id = default_model_id + model_format = "TORCH_SCRIPT" + expected_model_description = default_model_description + expected_model_config_data = { + "embedding_dimension": 768, + "pooling_mode": "CLS", + "normalize_result": False, + } + + clean_test_folder(TEST_FOLDER) + test_model5 = QuestionAnsweringModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model5.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + model_config_path_torch = test_model5.make_model_config_json( + model_format="TORCH_SCRIPT", verbose=True + ) + + compare_model_config( + model_config_path_torch, + model_id, + model_format, + expected_model_description=expected_model_description, + expected_model_config_data=expected_model_config_data, + ) + + clean_test_folder(TEST_FOLDER) + + +def test_make_model_config_json_for_onnx(): + model_id = default_model_id + model_format = "ONNX" + expected_model_description = default_model_description + expected_model_config_data = { + "embedding_dimension": 768, + "pooling_mode": "CLS", + "normalize_result": False, + } + + clean_test_folder(TEST_FOLDER) + test_model6 = QuestionAnsweringModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model6.save_as_onnx(model_id=model_id) + model_config_path_onnx = test_model6.make_model_config_json(model_format="ONNX") + + compare_model_config( + model_config_path_onnx, + model_id, + model_format, + expected_model_description=expected_model_description, + expected_model_config_data=expected_model_config_data, + ) + + clean_test_folder(TEST_FOLDER) + + +def test_overwrite_fields_in_model_config(): + model_id = default_model_id + model_format = "TORCH_SCRIPT" + + overwritten_model_config_data = { + "embedding_dimension": 128, + "pooling_mode": "MAX", + "normalize_result": False, + } + + clean_test_folder(TEST_FOLDER) + test_model8 = QuestionAnsweringModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model8.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + model_config_path_torch = test_model8.make_model_config_json( + model_format="TORCH_SCRIPT", + embedding_dimension=overwritten_model_config_data["embedding_dimension"], + pooling_mode=overwritten_model_config_data["pooling_mode"], + normalize_result=overwritten_model_config_data["normalize_result"], + ) + + compare_model_config( + model_config_path_torch, + model_id, + model_format, + expected_model_description=None, + expected_model_config_data=overwritten_model_config_data, + ) + + clean_test_folder(TEST_FOLDER) + + +def test_missing_expected_description_in_readme_file(): + model_id = default_model_id + model_format = "TORCH_SCRIPT" + expected_model_description = default_model_description + + clean_test_folder(TEST_FOLDER) + test_model10 = QuestionAnsweringModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model10.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + temp_path = os.path.join( + TEST_FOLDER, + "README.md", + ) + with open(temp_path, "w") as f: + f.write("No model description here") + model_config_path_torch = test_model10.make_model_config_json( + model_format=model_format + ) + try: + with open(model_config_path_torch) as json_file: + model_config_data_torch = json.load(json_file) + except Exception as exec: + assert ( + False + ), f"Creating model config file for tracing in {model_format} raised an exception {exec}" + + 
assert ( + "description" in model_config_data_torch + and model_config_data_torch["description"] == expected_model_description + ), "Should use default model description when description is missing from README.md" + + clean_test_folder(TEST_FOLDER) + + +def test_overwrite_description(): + model_id = default_model_id + model_format = "TORCH_SCRIPT" + expected_model_description = "Expected Description" + + clean_test_folder(TEST_FOLDER) + test_model11 = QuestionAnsweringModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model11.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + model_config_path_torch = test_model11.make_model_config_json( + model_format=model_format, description=expected_model_description + ) + try: + with open(model_config_path_torch) as json_file: + model_config_data_torch = json.load(json_file) + except Exception as exec: + assert ( + False + ), f"Creating model config file for tracing in {model_format} raised an exception {exec}" + + assert ( + "description" in model_config_data_torch + and model_config_data_torch["description"] == expected_model_description + ), "Cannot overwrite description in model config file" + + clean_test_folder(TEST_FOLDER) + + +def test_truncation_parameter(): + model_id = default_model_id + MAX_LENGTH_TASB = 512 + + clean_test_folder(TEST_FOLDER) + test_model13 = QuestionAnsweringModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model13.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + + tokenizer_json_file_path = os.path.join(TEST_FOLDER, "tokenizer.json") + try: + with open(tokenizer_json_file_path, "r") as json_file: + tokenizer_json = json.load(json_file) + except Exception as exec: + assert ( + False + ), f"Creating tokenizer.json file for tracing raised an exception {exec}" + + assert tokenizer_json[ + "truncation" + ], "truncation parameter in tokenizer.json is null" + + assert ( + tokenizer_json["truncation"]["max_length"] == MAX_LENGTH_TASB + ), "max_length is not properly set" + + clean_test_folder(TEST_FOLDER) + + +def test_save_as_pt_with_license(): + model_id = "distilbert-base-cased-distilled-squad" + model_format = "TORCH_SCRIPT" + torch_script_zip_file_path = os.path.join( + TEST_FOLDER, "distilbert-base-cased-distilled-squad.zip" + ) + torch_script_expected_filenames = { + "distilbert-base-cased-distilled-squad.pt", + "tokenizer.json", + "LICENSE", + } + + clean_test_folder(TEST_FOLDER) + test_model15 = QuestionAnsweringModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model15.save_as_pt( + model_id=model_id, + sentences=["today is sunny"], + add_apache_license=True, + ) + + compare_model_zip_file( + torch_script_zip_file_path, torch_script_expected_filenames, model_format + ) + + clean_test_folder(TEST_FOLDER) + + +def test_save_as_onnx_with_license(): + model_id = "distilbert-base-cased-distilled-squad" + model_format = "ONNX" + onnx_zip_file_path = os.path.join( + TEST_FOLDER, "distilbert-base-cased-distilled-squad.zip" + ) + onnx_expected_filenames = { + "distilbert-base-cased-distilled-squad.onnx", + "tokenizer.json", + "LICENSE", + } + + clean_test_folder(TEST_FOLDER) + test_model16 = QuestionAnsweringModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model16.save_as_onnx(model_id=model_id, add_apache_license=True) + + compare_model_zip_file(onnx_zip_file_path, onnx_expected_filenames, model_format) + + clean_test_folder(TEST_FOLDER) + + +clean_test_folder(TEST_FOLDER) +clean_test_folder(TESTDATA_UNZIP_FOLDER) diff --git 
a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py index c9c9046ba..17f86c75b 100644 --- a/tests/ml_models/test_sentencetransformermodel_pytest.py +++ b/tests/ml_models/test_sentencetransformermodel_pytest.py @@ -463,7 +463,7 @@ def test_overwrite_description(): def test_long_description(): model_id = "sentence-transformers/gtr-t5-base" model_format = "TORCH_SCRIPT" - expected_model_description = "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space. The model was specifically trained for the task of sematic search." + expected_model_description = "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space. The model was specifically trained for the task of semantic search." clean_test_folder(TEST_FOLDER) test_model12 = SentenceTransformerModel(