From 291db8066f437f3fab9e9baf96a78f3b525e4e78 Mon Sep 17 00:00:00 2001
From: Antoine Chaffin <38869395+NohTow@users.noreply.github.com>
Date: Mon, 25 Nov 2024 16:16:44 +0100
Subject: [PATCH] Creation of the PylateModelCard (#67)
* Creation of the PylateModelCard
* Fixing ruff top import
* Removing the example making tests fail
* Changing Sentence Transformer model default to PyLate
* Moving files to a dedicated subfolder
* Removing all the awfully copy-pasted redundant code to extend ST properly
* Adding init for hf_hub
* Changing docstring for automatic parsing documentation
* Consistency in the model args
* Adding save to tests to test saving/model card creation
---
pylate/hf_hub/__init__.py | 3 +
pylate/hf_hub/model_card.py | 259 +++++++++++++++++++++
pylate/hf_hub/model_card_template.md | 329 +++++++++++++++++++++++++++
pylate/models/colbert.py | 8 +-
tests/test_contrastive.py | 2 +
tests/test_kd.py | 2 +
6 files changed, 598 insertions(+), 5 deletions(-)
create mode 100644 pylate/hf_hub/__init__.py
create mode 100644 pylate/hf_hub/model_card.py
create mode 100644 pylate/hf_hub/model_card_template.md
diff --git a/pylate/hf_hub/__init__.py b/pylate/hf_hub/__init__.py
new file mode 100644
index 0000000..dae0b6a
--- /dev/null
+++ b/pylate/hf_hub/__init__.py
@@ -0,0 +1,3 @@
+from .model_card import PylateModelCardData
+
+__all__ = ["PylateModelCardData"]
diff --git a/pylate/hf_hub/model_card.py b/pylate/hf_hub/model_card.py
new file mode 100644
index 0000000..80cb330
--- /dev/null
+++ b/pylate/hf_hub/model_card.py
@@ -0,0 +1,259 @@
+from __future__ import annotations
+
+import logging
+from collections import defaultdict
+from dataclasses import dataclass, field, fields
+from pathlib import Path
+from platform import python_version
+from typing import TYPE_CHECKING, Any, Literal
+
+import torch
+import transformers
+from huggingface_hub import ModelCard
+from sentence_transformers import SentenceTransformerModelCardData
+from sentence_transformers import __version__ as sentence_transformers_version
+from sentence_transformers.util import (
+ is_accelerate_available,
+ is_datasets_available,
+)
+from torch import nn
+from transformers.integrations import CodeCarbonCallback
+
+from ..__version__ import __version__ as pylate_version
+
+
+logger = logging.getLogger(__name__)
+
+if TYPE_CHECKING:
+ from sentence_transformers.evaluation.SentenceEvaluator import SentenceEvaluator
+ from sentence_transformers.SentenceTransformer import SentenceTransformer
+ from sentence_transformers.trainer import SentenceTransformerTrainer
+
+
+IGNORED_FIELDS = ["model", "trainer", "eval_results_dict"]
+
+
+def get_versions() -> dict[str, Any]:
+ versions = {
+ "python": python_version(),
+ "sentence_transformers": sentence_transformers_version,
+ "transformers": transformers.__version__,
+ "torch": torch.__version__,
+ "pylate": pylate_version,
+ }
+ if is_accelerate_available():
+ from accelerate import __version__ as accelerate_version
+
+ versions["accelerate"] = accelerate_version
+ if is_datasets_available():
+ from datasets import __version__ as datasets_version
+
+ versions["datasets"] = datasets_version
+ from tokenizers import __version__ as tokenizers_version
+
+ versions["tokenizers"] = tokenizers_version
+ return versions
+
+
+@dataclass
+class PylateModelCardData(SentenceTransformerModelCardData):
+ """
+ A dataclass for storing data used in the model card.
+
+ Parameters
+ ----------
+ language
+ The model language, either a string or a list of strings, e.g., "en" or ["en", "de", "nl"].
+ license
+ The license of the model, e.g., "apache-2.0", "mit", or "cc-by-nc-sa-4.0".
+ model_name
+        The pretty name of the model, e.g., "PyLate model based on microsoft/mpnet-base".
+ model_id
+ The model ID for pushing the model to the Hub, e.g., "tomaarsen/sbert-mpnet-base-allnli".
+ train_datasets
+ A list of dictionaries containing names and/or Hugging Face dataset IDs for training datasets,
+ e.g., [{"name": "SNLI", "id": "stanfordnlp/snli"}, {"name": "MultiNLI", "id": "nyu-mll/multi_nli"}, {"name": "STSB"}].
+ eval_datasets
+ A list of dictionaries containing names and/or Hugging Face dataset IDs for evaluation datasets,
+ e.g., [{"name": "SNLI", "id": "stanfordnlp/snli"}, {"id": "mteb/stsbenchmark-sts"}].
+ task_name
+ The human-readable task the model is trained on, e.g., "semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more".
+ tags
+        A list of tags for the model, e.g., ["ColBERT", "PyLate", "sentence-transformers", "sentence-similarity", "feature-extraction"].
+ """
+
+ # Potentially provided by the user
+ language: str | list[str] | None = field(default_factory=list)
+ license: str | None = None
+ model_name: str | None = None
+ model_id: str | None = None
+ train_datasets: list[dict[str, str]] = field(default_factory=list)
+ eval_datasets: list[dict[str, str]] = field(default_factory=list)
+ task_name: str = "semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more"
+ tags: list[str] | None = field(
+ default_factory=lambda: [
+ "ColBERT",
+ "PyLate",
+ "sentence-transformers",
+ "sentence-similarity",
+ "feature-extraction",
+ ]
+ )
+ generate_widget_examples: Literal["deprecated"] = "deprecated"
+
+ # Automatically filled by `ModelCardCallback` and the Trainer directly
+ base_model: str | None = field(default=None, init=False)
+ base_model_revision: str | None = field(default=None, init=False)
+ non_default_hyperparameters: dict[str, Any] = field(
+ default_factory=dict, init=False
+ )
+ all_hyperparameters: dict[str, Any] = field(default_factory=dict, init=False)
+ eval_results_dict: dict[SentenceEvaluator, dict[str, Any]] | None = field(
+ default_factory=dict, init=False
+ )
+ training_logs: list[dict[str, float]] = field(default_factory=list, init=False)
+ widget: list[dict[str, str]] = field(default_factory=list, init=False)
+ predict_example: str | None = field(default=None, init=False)
+ label_example_list: list[dict[str, str]] = field(default_factory=list, init=False)
+ code_carbon_callback: CodeCarbonCallback | None = field(default=None, init=False)
+ citations: dict[str, str] = field(default_factory=dict, init=False)
+ best_model_step: int | None = field(default=None, init=False)
+ trainer: SentenceTransformerTrainer | None = field(
+ default=None, init=False, repr=False
+ )
+ datasets: list[str] = field(default_factory=list, init=False, repr=False)
+
+ # Utility fields
+ first_save: bool = field(default=True, init=False)
+ widget_step: int = field(default=-1, init=False)
+
+ # Computed once, always unchanged
+ pipeline_tag: str = field(default="sentence-similarity", init=False)
+ library_name: str = field(default="PyLate", init=False)
+ version: dict[str, str] = field(default_factory=get_versions, init=False)
+
+ # Passed via `register_model` only
+ model: SentenceTransformer | None = field(default=None, init=False, repr=False)
+
+ def set_losses(self, losses: list[nn.Module]) -> None:
+ citations = {
+ "Sentence Transformers": """
+@inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084"
+}""",
+ "PyLate": """
+@misc{PyLate,
+title={PyLate: Flexible Training and Retrieval for Late Interaction Models},
+author={Chaffin, Antoine and Sourty, Raphaël},
+url={https://github.com/lightonai/pylate},
+year={2024}
+}""",
+ }
+ for loss in losses:
+ try:
+ citations[loss.__class__.__name__] = loss.citation
+            except Exception:
+                # Not every loss defines a `citation` attribute; skip those that don't.
+                pass
+ inverted_citations = defaultdict(list)
+ for loss, citation in citations.items():
+ inverted_citations[citation].append(loss)
+
+ def join_list(losses: list[str]) -> str:
+ if len(losses) > 1:
+ return ", ".join(losses[:-1]) + " and " + losses[-1]
+ return losses[0]
+
+ self.citations = {
+ join_list(losses): citation
+ for citation, losses in inverted_citations.items()
+ }
+        # The dict comprehension deduplicates losses by their class name.
+        self.add_tags(
+ [
+ f"loss:{loss}"
+ for loss in {loss.__class__.__name__: loss for loss in losses}
+ ]
+ )
+
+ def to_dict(self) -> dict[str, Any]:
+ # Try to set the base model
+ if self.first_save and not self.base_model:
+ try:
+ self.try_to_set_base_model()
+ except Exception:
+ pass
+
+ # Set the model name
+ if not self.model_name:
+ if self.base_model:
+ self.model_name = f"PyLate model based on {self.base_model}"
+ else:
+ self.model_name = "PyLate"
+
+ super_dict = {field.name: getattr(self, field.name) for field in fields(self)}
+
+ # Compute required formats from the (usually post-training) evaluation data
+ if self.eval_results_dict:
+ try:
+ super_dict.update(self.format_eval_metrics())
+ except Exception as exc:
+ logger.warning(f"Error while formatting evaluation metrics: {exc}")
+ raise exc
+
+ # Compute required formats for the during-training evaluation data
+ if self.training_logs:
+ try:
+ super_dict.update(self.format_training_logs())
+ except Exception as exc:
+ logger.warning(f"Error while formatting training logs: {exc}")
+
+ super_dict["hide_eval_lines"] = len(self.training_logs) > 100
+
+ # Try to add the code carbon callback data
+ if (
+ self.code_carbon_callback
+ and self.code_carbon_callback.tracker
+ and self.code_carbon_callback.tracker._start_time is not None
+ ):
+ super_dict.update(self.get_codecarbon_data())
+
+ # Add some additional metadata stored in the model itself
+ super_dict["document_length"] = self.model.document_length
+ super_dict["query_length"] = self.model.query_length
+ super_dict["output_dimensionality"] = (
+ self.model.get_sentence_embedding_dimension()
+ )
+ super_dict["model_string"] = str(self.model)
+ if self.model.similarity_fn_name:
+ super_dict["similarity_fn_name"] = {
+ "cosine": "Cosine Similarity",
+ "dot": "Dot Product",
+ "euclidean": "Euclidean Distance",
+ "manhattan": "Manhattan Distance",
+ }.get(
+ self.model.similarity_fn_name,
+ self.model.similarity_fn_name.replace("_", " ").title(),
+ )
+ else:
+ super_dict["similarity_fn_name"] = "Cosine Similarity"
+
+ self.first_save = False
+
+ for key in IGNORED_FIELDS:
+ super_dict.pop(key, None)
+ return super_dict
+
+
+def generate_model_card(model: SentenceTransformer) -> str:
+    """Render the PyLate model card template with the model's card data."""
+    template_path = Path(__file__).parent / "model_card_template.md"
+ model_card = ModelCard.from_template(
+ card_data=model.model_card_data, template_path=template_path, hf_emoji="🐕"
+ )
+ return model_card.content
diff --git a/pylate/hf_hub/model_card_template.md b/pylate/hf_hub/model_card_template.md
new file mode 100644
index 0000000..0eeefcb
--- /dev/null
+++ b/pylate/hf_hub/model_card_template.md
@@ -0,0 +1,329 @@
+---
+# For reference on model card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1
+# Doc / guide: https://huggingface.co/docs/hub/model-cards
+{{ card_data }}
+---
+
+# {{ model_name if model_name else "PyLate model" }}
+
+This is a [PyLate](https://github.com/lightonai/pylate) model{% if base_model %} finetuned from [{{ base_model }}](https://huggingface.co/{{ base_model }}){% else %} trained{% endif %}{% if train_datasets | selectattr("name") | list %} on the {% for dataset in (train_datasets | selectattr("name")) %}{% if dataset.id %}[{{ dataset.name if dataset.name else dataset.id }}](https://huggingface.co/datasets/{{ dataset.id }}){% else %}{{ dataset.name }}{% endif %}{% if not loop.last %}{% if loop.index == (train_datasets | selectattr("name") | list | length - 1) %} and {% else %}, {% endif %}{% endif %}{% endfor %} dataset{{"s" if train_datasets | selectattr("name") | list | length > 1 else ""}}{% endif %}. It maps sentences & paragraphs to sequences of {{ output_dimensionality }}-dimensional dense vectors and can be used for semantic textual similarity using the MaxSim operator.
+
+## Model Details
+
+### Model Description
+- **Model Type:** PyLate model
+{% if base_model -%}
+ {%- if base_model_revision -%}
+    - **Base model:** [{{ base_model }}](https://huggingface.co/{{ base_model }}) <!-- at revision {{ base_model_revision }} -->
+ {%- else -%}
+ - **Base model:** [{{ base_model }}](https://huggingface.co/{{ base_model }})
+ {%- endif -%}
+{%- else -%}
+
+{%- endif %}
+- **Document Length:** {{ document_length }} tokens
+- **Query Length:** {{ query_length }} tokens
+- **Output Dimensionality:** {{ output_dimensionality }} dimensions
+- **Similarity Function:** MaxSim
+{% if train_datasets | selectattr("name") | list -%}
+ - **Training Dataset{{"s" if train_datasets | selectattr("name") | list | length > 1 else ""}}:**
+ {%- for dataset in (train_datasets | selectattr("name")) %}
+ {%- if dataset.id %}
+ - [{{ dataset.name if dataset.name else dataset.id }}](https://huggingface.co/datasets/{{ dataset.id }})
+ {%- else %}
+ - {{ dataset.name }}
+ {%- endif %}
+ {%- endfor %}
+{%- else -%}
+
+{%- endif %}
+{% if language -%}
+ - **Language{{"s" if language is not string and language | length > 1 else ""}}:**
+ {%- if language is string %} {{ language }}
+ {%- else %} {% for lang in language -%}
+ {{ lang }}{{ ", " if not loop.last else "" }}
+ {%- endfor %}
+ {%- endif %}
+{%- else -%}
+
+{%- endif %}
+{% if license -%}
+ - **License:** {{ license }}
+{%- else -%}
+
+{%- endif %}
+
+### Model Sources
+
+- **Documentation:** [PyLate Documentation](https://lightonai.github.io/pylate/)
+- **Repository:** [PyLate on GitHub](https://github.com/lightonai/pylate)
+- **Hugging Face:** [PyLate models on Hugging Face](https://huggingface.co/models?library=PyLate)
+
+### Full Model Architecture
+
+```
+{{ model_string }}
+```
+
+## Usage
+First install the PyLate library:
+
+```bash
+pip install -U pylate
+```
+
+### Retrieval
+
+PyLate provides a streamlined interface to index and retrieve documents using ColBERT models. Indexing relies on the Voyager HNSW index to efficiently store the document embeddings and enable fast retrieval.
+
+#### Indexing documents
+
+First, load the ColBERT model and initialize the Voyager index, then encode and index your documents:
+
+```python
+from pylate import indexes, models, retrieve
+
+# Step 1: Load the ColBERT model
+model = models.ColBERT(
+    model_name_or_path="{{ model_id | default('pylate_model_id', true) }}",
+)
+
+# Step 2: Initialize the Voyager index
+index = indexes.Voyager(
+ index_folder="pylate-index",
+ index_name="index",
+ override=True, # This overwrites the existing index if any
+)
+
+# Step 3: Encode the documents
+documents_ids = ["1", "2", "3"]
+documents = ["document 1 text", "document 2 text", "document 3 text"]
+
+documents_embeddings = model.encode(
+ documents,
+ batch_size=32,
+ is_query=False, # Ensure that it is set to False to indicate that these are documents, not queries
+ show_progress_bar=True,
+)
+
+# Step 4: Add document embeddings to the index by providing embeddings and corresponding ids
+index.add_documents(
+ documents_ids=documents_ids,
+ documents_embeddings=documents_embeddings,
+)
+```
+
+Note that you do not have to recreate the index and encode the documents every time. Once you have created an index and added the documents, you can re-use the index later by loading it:
+
+```python
+# To load an index, simply instantiate it with the correct folder/name and without overriding it
+index = indexes.Voyager(
+ index_folder="pylate-index",
+ index_name="index",
+)
+```
+
+#### Retrieving top-k documents for queries
+
+Once the documents are indexed, you can retrieve the top-k most relevant documents for a given set of queries.
+To do so, initialize the ColBERT retriever with the index you want to search in, encode the queries, and then retrieve the top-k documents to get the ids and relevance scores of the best matches:
+
+```python
+# Step 1: Initialize the ColBERT retriever
+retriever = retrieve.ColBERT(index=index)
+
+# Step 2: Encode the queries
+queries_embeddings = model.encode(
+ ["query for document 3", "query for document 1"],
+ batch_size=32,
+    is_query=True,  # Ensure that it is set to True to indicate that these are queries
+ show_progress_bar=True,
+)
+
+# Step 3: Retrieve top-k documents
+scores = retriever.retrieve(
+ queries_embeddings=queries_embeddings,
+ k=10, # Retrieve the top 10 matches for each query
+)
+```
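+
+Each entry in `scores` corresponds to one query. As a minimal sketch of consuming the output (assuming each match is a dictionary exposing an id and a relevance score, as in the PyLate documentation; verify against your installed version):
+
+```python
+# Sketch: iterate over the assumed per-query lists of matches and print
+# each retrieved document id together with its relevance score.
+for query_index, query_matches in enumerate(scores):
+    for match in query_matches:
+        print(query_index, match["id"], match["score"])
+```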
+
+### Reranking
+If you only want to use the ColBERT model to perform reranking on top of your first-stage retrieval pipeline without building an index, you can simply use the `rank.rerank` function and pass the queries and documents to rerank:
+
+```python
+from pylate import rank, models
+
+queries = [
+ "query A",
+ "query B",
+]
+
+documents = [
+ ["document A", "document B"],
+ ["document 1", "document C", "document B"],
+]
+
+documents_ids = [
+ [1, 2],
+ [1, 3, 2],
+]
+
+model = models.ColBERT(
+    model_name_or_path="{{ model_id | default('pylate_model_id', true) }}",
+)
+
+queries_embeddings = model.encode(
+ queries,
+ is_query=True,
+)
+
+documents_embeddings = model.encode(
+ documents,
+ is_query=False,
+)
+
+reranked_documents = rank.rerank(
+ documents_ids=documents_ids,
+ queries_embeddings=queries_embeddings,
+ documents_embeddings=documents_embeddings,
+)
+```
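+
+As a minimal sketch of reading the reranked output (assuming `rerank` returns one list per query, ordered by decreasing relevance, with each entry carrying the document id and its score; verify against your installed version):
+
+```python
+# Sketch: print the assumed best match (first entry) for each query.
+for ranked in reranked_documents:
+    best = ranked[0]
+    print(best["id"], best["score"])
+```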
+
+
+
+
+
+
+{% if eval_metrics %}
+## Evaluation
+
+### Metrics
+{% for metrics in eval_metrics %}
+#### {{ metrics.description }}
+{% if metrics.dataset_name %}* Dataset: `{{ metrics.dataset_name }}`{% endif %}
+* Evaluated with {% if metrics.class_name.startswith("sentence_transformers.") %}[{{ metrics.class_name.split(".")[-1] }}](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.{{ metrics.class_name.split(".")[-1] }}){% else %}{{ metrics.class_name }}{% endif %}
+
+{{ metrics.table }}
+{%- endfor %}{% endif %}
+
+
+
+
+## Training Details
+{% for dataset_type, dataset_list in [("training", train_datasets), ("evaluation", eval_datasets)] %}{% if dataset_list %}
+### {{ dataset_type.title() }} Dataset{{"s" if dataset_list | length > 1 else ""}}
+{% for dataset in dataset_list %}
+#### {{ dataset['name'] or 'Unnamed Dataset' }}
+
+{% if dataset['name'] %}* Dataset: {% if 'id' in dataset %}[{{ dataset['name'] }}](https://huggingface.co/datasets/{{ dataset['id'] }}){% else %}{{ dataset['name'] }}{% endif %}
+{%- if 'revision' in dataset and 'id' in dataset %} at [{{ dataset['revision'][:7] }}](https://huggingface.co/datasets/{{ dataset['id'] }}/tree/{{ dataset['revision'] }}){% endif %}{% endif %}
+{% if dataset['size'] %}* Size: {{ "{:,}".format(dataset['size']) }} {{ dataset_type }} samples
+{% endif %}* Columns: {% if dataset['columns'] | length == 1 %}{{ dataset['columns'][0] }}{% elif dataset['columns'] | length == 2 %}{{ dataset['columns'][0] }} and {{ dataset['columns'][1] }}{% else %}{{ dataset['columns'][:-1] | join(', ') }}, and {{ dataset['columns'][-1] }}{% endif %}
+{% if dataset['stats_table'] %}* Approximate statistics based on the first {{ [dataset['size'], 1000] | min }} samples:
+{{ dataset['stats_table'] }}{% endif %}{% if dataset['examples_table'] %}* Samples:
+{{ dataset['examples_table'] }}{% endif %}* Loss: {% if dataset["loss"]["fullname"].startswith("sentence_transformers.") %}[{{ dataset["loss"]["fullname"].split(".")[-1] }}](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#{{ dataset["loss"]["fullname"].split(".")[-1].lower() }}){% else %}{{ dataset["loss"]["fullname"] }}{% endif %}{% if "config_code" in dataset["loss"] %} with these parameters:
+{{ dataset["loss"]["config_code"] }}{% endif %}
+{% endfor %}{% endif %}{% endfor -%}
+
+{% if all_hyperparameters %}
+### Training Hyperparameters
+{% if non_default_hyperparameters -%}
+#### Non-Default Hyperparameters
+
+{% for name, value in non_default_hyperparameters.items() %}- `{{ name }}`: {{ value }}
+{% endfor %}{%- endif %}
+#### All Hyperparameters
+<details><summary>Click to expand</summary>
+
+{% for name, value in all_hyperparameters.items() %}- `{{ name }}`: {{ value }}
+{% endfor %}</details>
+
+{% endif %}
+
+{%- if eval_lines %}
+### Training Logs
+{% if hide_eval_lines %}<details><summary>Click to expand</summary>
+
+{% endif -%}
+{{ eval_lines }}{% if explain_bold_in_eval %}
+* The bold row denotes the saved checkpoint.{% endif %}
+{%- if hide_eval_lines %}
+</details>{% endif %}
+{% endif %}
+
+{%- if co2_eq_emissions %}
+### Environmental Impact
+Carbon emissions were measured using [CodeCarbon](https://github.com/mlco2/codecarbon).
+- **Energy Consumed**: {{ "%.3f"|format(co2_eq_emissions["energy_consumed"]) }} kWh
+- **Carbon Emitted**: {{ "%.3f"|format(co2_eq_emissions["emissions"] / 1000) }} kg of CO2
+- **Hours Used**: {{ co2_eq_emissions["hours_used"] }} hours
+
+### Training Hardware
+- **On Cloud**: {{ "Yes" if co2_eq_emissions["on_cloud"] else "No" }}
+- **GPU Model**: {{ co2_eq_emissions["hardware_used"] or "No GPU used" }}
+- **CPU Model**: {{ co2_eq_emissions["cpu_model"] }}
+- **RAM Size**: {{ "%.2f"|format(co2_eq_emissions["ram_total_size"]) }} GB
+{% endif %}
+### Framework Versions
+- Python: {{ version["python"] }}
+- Sentence Transformers: {{ version["sentence_transformers"] }}
+- PyLate: {{ version["pylate"] }}
+- Transformers: {{ version["transformers"] }}
+- PyTorch: {{ version["torch"] }}
+- Accelerate: {{ version["accelerate"] }}
+- Datasets: {{ version["datasets"] }}
+- Tokenizers: {{ version["tokenizers"] }}
+
+
+## Citation
+
+### BibTeX
+{% for loss_name, citation in citations.items() %}
+#### {{ loss_name }}
+```bibtex
+{{ citation | trim }}
+```
+{% endfor %}
+
+
+
+
+
\ No newline at end of file
diff --git a/pylate/models/colbert.py b/pylate/models/colbert.py
index ca64cbb..c57cb3b 100644
--- a/pylate/models/colbert.py
+++ b/pylate/models/colbert.py
@@ -13,10 +13,6 @@
from numpy import ndarray
from scipy.cluster import hierarchy
from sentence_transformers import SentenceTransformer
-from sentence_transformers.model_card import (
- SentenceTransformerModelCardData,
- generate_model_card,
-)
from sentence_transformers.models import Dense as DenseSentenceTransformer
from sentence_transformers.models import Transformer
from sentence_transformers.quantization import quantize_embeddings
@@ -25,6 +21,7 @@
from torch import nn
from tqdm.autonotebook import trange
+from ..hf_hub.model_card import PylateModelCardData, generate_model_card
from ..utils import _start_multi_process_pool
from .Dense import Dense
@@ -217,7 +214,7 @@ def __init__(
model_kwargs: dict | None = None,
tokenizer_kwargs: dict | None = None,
config_kwargs: dict | None = None,
- model_card_data: Optional[SentenceTransformerModelCardData] = None,
+ model_card_data: PylateModelCardData | None = None,
) -> None:
self.query_prefix = query_prefix
self.document_prefix = document_prefix
@@ -225,6 +222,7 @@ def __init__(
self.document_length = document_length
self.attend_to_expansion_tokens = attend_to_expansion_tokens
self.skiplist_words = skiplist_words
+ model_card_data = model_card_data or PylateModelCardData()
super(ColBERT, self).__init__(
model_name_or_path=model_name_or_path,
diff --git a/tests/test_contrastive.py b/tests/test_contrastive.py
index 4edbbb7..52e1a8b 100644
--- a/tests/test_contrastive.py
+++ b/tests/test_contrastive.py
@@ -65,6 +65,8 @@ def test_contrastive_training() -> None:
trainer.train()
+ model.save_pretrained("tests/contrastive/final")
+
assert os.path.isdir("tests/contrastive")
metrics = dev_evaluation(
diff --git a/tests/test_kd.py b/tests/test_kd.py
index 139876a..0cc02b8 100644
--- a/tests/test_kd.py
+++ b/tests/test_kd.py
@@ -63,6 +63,8 @@ def test_kd_training() -> None:
trainer.train()
+ model.save_pretrained("tests/kd/final")
+
assert os.path.isdir("tests/kd")
if os.path.exists(path="tests/kd"):