[SPARKNLP-1105] Introducing AlbertForMultipleChoice

JohnSnowLabs · Dec 27, 2024 · 1ea76c3 · 1ea76c3
1 parent 8deb7da
commit 1ea76c3
Show file tree

Hide file tree

Showing 10 changed files with 4,188 additions and 68 deletions.
diff --git a/...ples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_AlbertForMultipleChoice.ipynb b/...ples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_AlbertForMultipleChoice.ipynb
diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForMultipleChoice.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForMultipleChoice.ipynb
@@ -91,30 +91,6 @@
     "- We'll use the trained model above as an example and load it as an `ORTModelForMultipleChoice`, representing an ONNX model."
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "avTe8Oe5N-vw",
-    "outputId": "270cf088-de9d-4dd2-d0cf-56daba62e141"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"
-     ]
-    }
-   ],
-   "source": [
-    "from google.colab import drive\n",
-    "drive.mount('/content/drive')"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 5,
@@ -446,51 +422,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "al3szq-HRy2s",
-    "outputId": "a08dc94b-614a-44f8-daf1-98149d057011"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Requirement already satisfied: pyspark in /usr/local/lib/python3.10/dist-packages (3.5.3)\n",
-      "Requirement already satisfied: py4j==0.10.9.7 in /usr/local/lib/python3.10/dist-packages (from pyspark) (0.10.9.7)\n"
-     ]
-    }
-   ],
-   "source": [
-    "!pip install pyspark"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "9ld2osF6STCv",
-    "outputId": "ad4bd7ce-b2f9-406c-bc47-63a18f8b1ee6"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Processing ./spark_nlp-5.5.0-py2.py3-none-any.whl\n",
-      "Installing collected packages: spark-nlp\n",
-      "Successfully installed spark-nlp-5.5.0\n"
-     ]
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
-    "!pip install spark_nlp-5.5.0-py2.py3-none-any.whl"
+    "!wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash"
    ]
   },
   {

diff --git a/...hon/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_AlbertForMultipleChoice.ipynb b/...hon/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_AlbertForMultipleChoice.ipynb
diff --git a/python/sparknlp/annotator/classifier_dl/__init__.py b/python/sparknlp/annotator/classifier_dl/__init__.py
@@ -55,3 +55,4 @@
 from sparknlp.annotator.classifier_dl.albert_for_zero_shot_classification import *
 from sparknlp.annotator.classifier_dl.camembert_for_zero_shot_classification import *
 from sparknlp.annotator.classifier_dl.bert_for_multiple_choice import *
+from sparknlp.annotator.classifier_dl.albert_for_multiple_choice import *
diff --git a/python/sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py b/python/sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py
@@ -0,0 +1,161 @@
+#  Copyright 2017-2024 John Snow Labs
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+from sparknlp.common import *
+
+class AlbertForMultipleChoice(AnnotatorModel,
+                              HasCaseSensitiveProperties,
+                              HasBatchedAnnotate,
+                              HasEngine,
+                              HasMaxSentenceLengthLimit):
+    """AlbertForMultipleChoice can load ALBERT Models with a multiple choice classification head on top
+    (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> multipleChoice = AlbertForMultipleChoice.pretrained() \\
+    ...     .setInputCols(["document_question", "document_context"]) \\
+    ...     .setOutputCol("answer")
+
+    The default model is ``"albert_base_uncased_multiple_choice"``, if no name is
+    provided.
+
+    For available pretrained models please see the `Models Hub
+    <https://sparknlp.org/models?task=Multiple+Choice>`__.
+
+    To see which models are compatible and how to import them see
+    `Import Transformers into Spark NLP 🚀
+    <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT, DOCUMENT``    ``CHUNK``
+    ====================== ======================
+
+    Parameters
+    ----------
+    batchSize
+        Batch size. Large values allow faster processing but require more
+        memory, by default 4
+    caseSensitive
+        Whether to ignore case in tokens for embeddings matching, by default
+        False
+    maxSentenceLength
+        Max sentence length to process, by default 512
+    choicesDelimiter
+        Delimiter character used to split the choices, by default ``","``
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = MultiDocumentAssembler() \\
+    ...     .setInputCols(["question", "context"]) \\
+    ...     .setOutputCols(["document_question", "document_context"])
+    >>> multipleChoice = AlbertForMultipleChoice.pretrained() \\
+    ...     .setInputCols(["document_question", "document_context"]) \\
+    ...     .setOutputCol("answer") \\
+    ...     .setCaseSensitive(False)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     multipleChoice
+    ... ])
+    >>> data = spark.createDataFrame([["The Eiffel Tower is located in which country?", "Germany, France, Italy"]]).toDF("question", "context")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.select("answer.result").show(truncate=False)
+    +--------+
+    |result  |
+    +--------+
+    |[France]|
+    +--------+
+    """
+    name = "AlbertForMultipleChoice"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.CHUNK
+
+    choicesDelimiter = Param(Params._dummy(),
+                             "choicesDelimiter",
+                             "Delimiter character use to split the choices",
+                             TypeConverters.toString)
+
+    def setChoicesDelimiter(self, value):
+        """Sets the delimiter character used to split the choices.
+
+        Parameters
+        ----------
+        value : str
+            Delimiter character used to split the choices
+        """
+        # Must target the choicesDelimiter param; writing to caseSensitive
+        # here would leave the delimiter untouched and clobber caseSensitive.
+        return self._set(choicesDelimiter=value)
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.AlbertForMultipleChoice",
+                 java_model=None):
+        super(AlbertForMultipleChoice, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        # Python-side defaults; keep the class docstring in sync with these.
+        self._setDefault(
+            batchSize=4,
+            maxSentenceLength=512,
+            caseSensitive=False,
+            choicesDelimiter=","
+        )
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+
+        Returns
+        -------
+        AlbertForMultipleChoice
+            The restored model
+        """
+        from sparknlp.internal import _AlbertMultipleChoiceLoader
+        jModel = _AlbertMultipleChoiceLoader(folder, spark_session._jsparkSession)._java_obj
+        return AlbertForMultipleChoice(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="albert_base_uncased_multiple_choice", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default
+            "albert_base_uncased_multiple_choice"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLPs repositories otherwise.
+
+        Returns
+        -------
+        AlbertForMultipleChoice
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(AlbertForMultipleChoice, name, lang, remote_loc)
diff --git a/python/sparknlp/internal/__init__.py b/python/sparknlp/internal/__init__.py
@@ -67,6 +67,15 @@ def __init__(self, path, jspark):
         )
 
 
+class _AlbertMultipleChoiceLoader(ExtendedJavaWrapper):
+    """Wrapper that invokes the JVM-side ``AlbertForMultipleChoice.loadSavedModel``
+    with the given model path and Java Spark session."""
+
+    def __init__(self, path, jspark):
+        # Fully qualified name of the Scala loader this wrapper forwards to.
+        java_loader = "com.johnsnowlabs.nlp.annotators.classifier.dl.AlbertForMultipleChoice.loadSavedModel"
+        super(_AlbertMultipleChoiceLoader, self).__init__(java_loader, path, jspark)
+
+
 class _BertLoader(ExtendedJavaWrapper):
     def __init__(self, path, jspark, use_openvino=False):
         super(_BertLoader, self).__init__(

diff --git a/python/test/annotator/classifier_dl/albert_for_multiple_choice_test.py b/python/test/annotator/classifier_dl/albert_for_multiple_choice_test.py
@@ -0,0 +1,79 @@
+#  Copyright 2017-2024 John Snow Labs
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import unittest
+
+import pytest
+
+from sparknlp.annotator.classifier_dl.albert_for_multiple_choice import AlbertForMultipleChoice
+from sparknlp.base import *
+from test.util import SparkContextForTest
+
+
+class AlbertForMultipleChoiceTestSetup(unittest.TestCase):
+    """Shared fixture for the AlbertForMultipleChoice tests.
+
+    Builds a two-stage pipeline (MultiDocumentAssembler followed by a saved
+    AlbertForMultipleChoice model loaded from disk) and fits it on a
+    placeholder DataFrame, exposing the result as ``self.pipeline_model``.
+    """
+
+    def setUp(self):
+        # Local import so this fixture does not depend on a file-level import.
+        import os
+
+        # The saved model lives outside the repository. Allow the root folder
+        # to be overridden through the environment so the test is not tied to
+        # a single machine; the original location remains the default.
+        sparkNLPModelPath = os.environ.get(
+            "SPARK_NLP_TEST_MODELS_PATH",
+            "/media/danilo/Data/Danilo/JSL/models/transformers/spark-nlp"
+        )
+
+        self.spark = SparkContextForTest.spark
+        self.question = "The Eiffel Tower is located in which country?"
+        self.choices = "Germany, France, Italy"
+
+        # Placeholder frame used only to fit the pipeline.
+        empty_df = self.spark.createDataFrame([[""]]).toDF("text")
+
+        document_assembler = MultiDocumentAssembler() \
+            .setInputCols(["question", "context"]) \
+            .setOutputCols(["document_question", "document_context"])
+
+        albert_for_multiple_choice = AlbertForMultipleChoice.load(sparkNLPModelPath + "/openvino/albert_multiple_choice_openvino") \
+            .setInputCols(["document_question", "document_context"]) \
+            .setOutputCol("answer")
+
+        pipeline = Pipeline(stages=[document_assembler, albert_for_multiple_choice])
+
+        self.pipeline_model = pipeline.fit(empty_df)
+
+
+# Needs a saved model on disk, so it is tagged as slow.
+# NOTE(review): assumes the project registers a "slow" pytest marker - confirm.
+@pytest.mark.slow
+class AlbertForMultipleChoiceTest(AlbertForMultipleChoiceTestSetup, unittest.TestCase):
+    """Runs the fitted pipeline on a one-row DataFrame and checks the answer."""
+
+    def setUp(self):
+        super().setUp()
+        self.data = self.spark.createDataFrame([[self.question, self.choices]]).toDF("question", "context")
+        self.data.show(truncate=False)
+
+    def test_run(self):
+        result_df = self.pipeline_model.transform(self.data)
+        result_df.show(truncate=False)
+        for row in result_df.collect():
+            # assertNotEqual reports both operands when the check fails.
+            self.assertNotEqual(row["answer"][0].result, "")
+
+
+# Needs a saved model on disk, so it is tagged as slow.
+# NOTE(review): assumes the project registers a "slow" pytest marker - confirm.
+@pytest.mark.slow
+class LightAlbertForMultipleChoiceTest(AlbertForMultipleChoiceTestSetup, unittest.TestCase):
+    """Exercises the fitted pipeline through LightPipeline.
+
+    The fixture's ``setUp`` is inherited unchanged, so no override is needed.
+    """
+
+    def runTest(self):
+        light_pipeline = LightPipeline(self.pipeline_model)
+        annotations_result = light_pipeline.fullAnnotate(self.question, self.choices)
+        print(annotations_result)
+        for result in annotations_result:
+            self.assertNotEqual(result["answer"][0].result, "")
+
+        result = light_pipeline.annotate(self.question, self.choices)
+        print(result)
+        # NOTE(review): annotate() presumably returns a list of strings per
+        # output column; comparing that list to "" could never fail, so check
+        # that an answer exists and that the first one is non-empty instead.
+        self.assertTrue(result["answer"])
+        self.assertNotEqual(result["answer"][0], "")