From dfaabd40fe7e726b2eb7c77a6d25f9a71c3c0ece Mon Sep 17 00:00:00 2001 From: jsl-models <74001263+jsl-models@users.noreply.github.com> Date: Sat, 1 Jul 2023 20:06:03 +0700 Subject: [PATCH] 2023-06-08-instructor_base_en (#13850) * Add model 2023-06-08-instructor_base_en * Update 2023-06-08-instructor_base_en.md * Add model 2023-06-21-e5_base_v2_en * Add model 2023-06-21-e5_base_en * Add model 2023-06-21-e5_large_v2_en * Add model 2023-06-21-e5_large_en * Add model 2023-06-21-e5_small_v2_en * Add model 2023-06-21-e5_small_en * Add model 2023-06-21-instructor_large_en --------- Co-authored-by: prabod Co-authored-by: Maziyar Panahi --- .../prabod/2023-06-08-instructor_base_en.md | 75 +++++++++++++++++++ docs/_posts/prabod/2023-06-21-e5_base_en.md | 71 ++++++++++++++++++ .../_posts/prabod/2023-06-21-e5_base_v2_en.md | 68 +++++++++++++++++ docs/_posts/prabod/2023-06-21-e5_large_en.md | 71 ++++++++++++++++++ .../prabod/2023-06-21-e5_large_v2_en.md | 71 ++++++++++++++++++ docs/_posts/prabod/2023-06-21-e5_small_en.md | 71 ++++++++++++++++++ .../prabod/2023-06-21-e5_small_v2_en.md | 71 ++++++++++++++++++ .../prabod/2023-06-21-instructor_large_en.md | 74 ++++++++++++++++++ 8 files changed, 572 insertions(+) create mode 100644 docs/_posts/prabod/2023-06-08-instructor_base_en.md create mode 100644 docs/_posts/prabod/2023-06-21-e5_base_en.md create mode 100644 docs/_posts/prabod/2023-06-21-e5_base_v2_en.md create mode 100644 docs/_posts/prabod/2023-06-21-e5_large_en.md create mode 100644 docs/_posts/prabod/2023-06-21-e5_large_v2_en.md create mode 100644 docs/_posts/prabod/2023-06-21-e5_small_en.md create mode 100644 docs/_posts/prabod/2023-06-21-e5_small_v2_en.md create mode 100644 docs/_posts/prabod/2023-06-21-instructor_large_en.md diff --git a/docs/_posts/prabod/2023-06-08-instructor_base_en.md b/docs/_posts/prabod/2023-06-08-instructor_base_en.md new file mode 100644 index 00000000000000..01d6754fa28473 --- /dev/null +++ b/docs/_posts/prabod/2023-06-08-instructor_base_en.md 
@@ -0,0 +1,75 @@ +--- +layout: model +title: Instructor Base Sentence Embeddings +author: John Snow Labs +name: instructor_base +date: 2023-06-08 +tags: [instructor, sentence_embeddings, t5, text_semantic_similarity, text_reranking, sentence_similarity, en, open_source, tensorflow] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: tensorflow +annotator: InstructorEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Instructor👨‍🏫, an instruction-finetuned text embedding model that can generate text embeddings tailored to any task (e.g., classification, retrieval, clustering, text evaluation, etc.) and domains (e.g., science, finance, etc.) by simply providing the task instruction, without any finetuning. Instructor👨‍ achieves sota on 70 diverse embedding tasks. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/instructor_base_en_5.0.0_3.0_1686224519068.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/instructor_base_en_5.0.0_3.0_1686224519068.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+instruction = InstructorEmbeddings.pretrained("instructor_base","en") \
+  .setInstruction("Instruction here: ") \
+  .setInputCols(["documents"]) \
+  .setOutputCol("instructor")
+
+pipeline = Pipeline().setStages([document_assembler, instruction])
+```
+```scala
+val embeddings = InstructorEmbeddings
+  .pretrained("instructor_base","en")
+  .setInstruction("Instruction here: ")
+  .setInputCols(Array("document"))
+  .setOutputCol("instructor")
+
+val pipeline = new Pipeline().setStages(Array(document, embeddings))
+```
+</div>
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|instructor_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[documents]| +|Output Labels:|[instructor]| +|Language:|en| +|Size:|406.6 MB| + +## References + +https://huggingface.co/hkunlp/instructor-base diff --git a/docs/_posts/prabod/2023-06-21-e5_base_en.md b/docs/_posts/prabod/2023-06-21-e5_base_en.md new file mode 100644 index 00000000000000..d81452cf903590 --- /dev/null +++ b/docs/_posts/prabod/2023-06-21-e5_base_en.md @@ -0,0 +1,71 @@ +--- +layout: model +title: E5 Base Sentence Embeddings +author: John Snow Labs +name: e5_base +date: 2023-06-21 +tags: [en, open_source, tensorflow] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: tensorflow +annotator: E5Embeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Text Embeddings by Weakly-Supervised Contrastive Pre-training. Liang Wang, Nan Yang, Xiaolong Huang, Binxing Jiao, Linjun Yang, Daxin Jiang, Rangan Majumder, Furu Wei, arXiv 2022 + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/e5_base_en_5.0.0_3.0_1687350215936.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/e5_base_en_5.0.0_3.0_1687350215936.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+embeddings = E5Embeddings.pretrained("e5_base","en") \
+  .setInputCols(["documents"]) \
+  .setOutputCol("e5_embeddings")
+
+pipeline = Pipeline().setStages([document_assembler, embeddings])
+```
+```scala
+val embeddings = E5Embeddings.pretrained("e5_base","en")
+  .setInputCols(Array("document"))
+  .setOutputCol("e5_embeddings")
+val pipeline = new Pipeline().setStages(Array(document, embeddings))
+```
+</div>
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|e5_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[documents]| +|Output Labels:|[e5]| +|Language:|en| +|Size:|260.5 MB| + +## References + +https://huggingface.co/intfloat/e5-base \ No newline at end of file diff --git a/docs/_posts/prabod/2023-06-21-e5_base_v2_en.md b/docs/_posts/prabod/2023-06-21-e5_base_v2_en.md new file mode 100644 index 00000000000000..140496bade1a70 --- /dev/null +++ b/docs/_posts/prabod/2023-06-21-e5_base_v2_en.md @@ -0,0 +1,68 @@ +--- +layout: model +title: E5 Base v2 Sentence Embeddings +author: John Snow Labs +name: e5_base_v2 +date: 2023-06-21 +tags: [e5, sentence_embeddings, en, open_source, tensorflow] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: tensorflow +annotator: E5Embeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Text Embeddings by Weakly-Supervised Contrastive Pre-training. Liang Wang, Nan Yang, Xiaolong Huang, Binxing Jiao, Linjun Yang, Daxin Jiang, Rangan Majumder, Furu Wei, arXiv 2022 + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/e5_base_v2_en_5.0.0_3.4_1687349803929.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/e5_base_v2_en_5.0.0_3.4_1687349803929.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+embeddings = E5Embeddings.pretrained("e5_base_v2","en") \
+  .setInputCols(["documents"]) \
+  .setOutputCol("e5_embeddings")
+
+pipeline = Pipeline().setStages([document_assembler, embeddings])
+```
+```scala
+val embeddings = E5Embeddings.pretrained("e5_base_v2","en")
+  .setInputCols(Array("document"))
+  .setOutputCol("e5_embeddings")
+
+val pipeline = new Pipeline().setStages(Array(document, embeddings))
+```
+</div>
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|e5_base_v2| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[documents]| +|Output Labels:|[e5]| +|Language:|en| +|Size:|260.6 MB| \ No newline at end of file diff --git a/docs/_posts/prabod/2023-06-21-e5_large_en.md b/docs/_posts/prabod/2023-06-21-e5_large_en.md new file mode 100644 index 00000000000000..e1bd6b18e30107 --- /dev/null +++ b/docs/_posts/prabod/2023-06-21-e5_large_en.md @@ -0,0 +1,71 @@ +--- +layout: model +title: E5 Large Sentence Embeddings +author: John Snow Labs +name: e5_large +date: 2023-06-21 +tags: [en, open_source, tensorflow] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: tensorflow +annotator: E5Embeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Text Embeddings by Weakly-Supervised Contrastive Pre-training. Liang Wang, Nan Yang, Xiaolong Huang, Binxing Jiao, Linjun Yang, Daxin Jiang, Rangan Majumder, Furu Wei, arXiv 2022 + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/e5_large_en_5.0.0_3.0_1687350762773.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/e5_large_en_5.0.0_3.0_1687350762773.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+embeddings = E5Embeddings.pretrained("e5_large","en") \
+  .setInputCols(["documents"]) \
+  .setOutputCol("e5_embeddings")
+
+pipeline = Pipeline().setStages([document_assembler, embeddings])
+```
+```scala
+val embeddings = E5Embeddings.pretrained("e5_large","en")
+  .setInputCols(Array("document"))
+  .setOutputCol("e5_embeddings")
+val pipeline = new Pipeline().setStages(Array(document, embeddings))
+```
+</div>
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|e5_large| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[documents]| +|Output Labels:|[e5]| +|Language:|en| +|Size:|799.1 MB| + +## References + +https://huggingface.co/intfloat/e5-large \ No newline at end of file diff --git a/docs/_posts/prabod/2023-06-21-e5_large_v2_en.md b/docs/_posts/prabod/2023-06-21-e5_large_v2_en.md new file mode 100644 index 00000000000000..10b99644a9dfac --- /dev/null +++ b/docs/_posts/prabod/2023-06-21-e5_large_v2_en.md @@ -0,0 +1,71 @@ +--- +layout: model +title: E5 Large V2 Sentence Embeddings +author: John Snow Labs +name: e5_large_v2 +date: 2023-06-21 +tags: [en, open_source, tensorflow] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: tensorflow +annotator: E5Embeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Text Embeddings by Weakly-Supervised Contrastive Pre-training. Liang Wang, Nan Yang, Xiaolong Huang, Binxing Jiao, Linjun Yang, Daxin Jiang, Rangan Majumder, Furu Wei, arXiv 2022 + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/e5_large_v2_en_5.0.0_3.0_1687350498606.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/e5_large_v2_en_5.0.0_3.0_1687350498606.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+embeddings = E5Embeddings.pretrained("e5_large_v2","en") \
+  .setInputCols(["documents"]) \
+  .setOutputCol("e5_embeddings")
+
+pipeline = Pipeline().setStages([document_assembler, embeddings])
+```
+```scala
+val embeddings = E5Embeddings.pretrained("e5_large_v2","en")
+  .setInputCols(Array("document"))
+  .setOutputCol("e5_embeddings")
+val pipeline = new Pipeline().setStages(Array(document, embeddings))
+```
+</div>
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|e5_large_v2| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[documents]| +|Output Labels:|[e5]| +|Language:|en| +|Size:|799.1 MB| + +## References + +https://huggingface.co/intfloat/e5-large-v2 \ No newline at end of file diff --git a/docs/_posts/prabod/2023-06-21-e5_small_en.md b/docs/_posts/prabod/2023-06-21-e5_small_en.md new file mode 100644 index 00000000000000..018b4754b15d5e --- /dev/null +++ b/docs/_posts/prabod/2023-06-21-e5_small_en.md @@ -0,0 +1,71 @@ +--- +layout: model +title: E5 Small Sentence Embeddings +author: John Snow Labs +name: e5_small +date: 2023-06-21 +tags: [en, open_source, tensorflow] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: tensorflow +annotator: E5Embeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Text Embeddings by Weakly-Supervised Contrastive Pre-training. Liang Wang, Nan Yang, Xiaolong Huang, Binxing Jiao, Linjun Yang, Daxin Jiang, Rangan Majumder, Furu Wei, arXiv 2022 + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/e5_small_en_5.0.0_3.0_1687351055229.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/e5_small_en_5.0.0_3.0_1687351055229.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+embeddings = E5Embeddings.pretrained("e5_small","en") \
+  .setInputCols(["documents"]) \
+  .setOutputCol("e5_embeddings")
+
+pipeline = Pipeline().setStages([document_assembler, embeddings])
+```
+```scala
+val embeddings = E5Embeddings.pretrained("e5_small","en")
+  .setInputCols(Array("document"))
+  .setOutputCol("e5_embeddings")
+val pipeline = new Pipeline().setStages(Array(document, embeddings))
+```
+</div>
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|e5_small| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[documents]| +|Output Labels:|[e5]| +|Language:|en| +|Size:|80.9 MB| + +## References + +https://huggingface.co/intfloat/e5-small \ No newline at end of file diff --git a/docs/_posts/prabod/2023-06-21-e5_small_v2_en.md b/docs/_posts/prabod/2023-06-21-e5_small_v2_en.md new file mode 100644 index 00000000000000..4f7b015718f5c1 --- /dev/null +++ b/docs/_posts/prabod/2023-06-21-e5_small_v2_en.md @@ -0,0 +1,71 @@ +--- +layout: model +title: E5 Small V2 Sentence Embeddings +author: John Snow Labs +name: e5_small_v2 +date: 2023-06-21 +tags: [en, open_source, tensorflow] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: tensorflow +annotator: E5Embeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Text Embeddings by Weakly-Supervised Contrastive Pre-training. Liang Wang, Nan Yang, Xiaolong Huang, Binxing Jiao, Linjun Yang, Daxin Jiang, Rangan Majumder, Furu Wei, arXiv 2022 + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/e5_small_v2_en_5.0.0_3.0_1687350926144.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/e5_small_v2_en_5.0.0_3.0_1687350926144.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+embeddings = E5Embeddings.pretrained("e5_small_v2","en") \
+  .setInputCols(["documents"]) \
+  .setOutputCol("e5_embeddings")
+
+pipeline = Pipeline().setStages([document_assembler, embeddings])
+```
+```scala
+val embeddings = E5Embeddings.pretrained("e5_small_v2","en")
+  .setInputCols(Array("document"))
+  .setOutputCol("e5_embeddings")
+val pipeline = new Pipeline().setStages(Array(document, embeddings))
+```
+</div>
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|e5_small_v2| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[documents]| +|Output Labels:|[e5]| +|Language:|en| +|Size:|80.9 MB| + +## References + +https://huggingface.co/intfloat/e5-small-v2 \ No newline at end of file diff --git a/docs/_posts/prabod/2023-06-21-instructor_large_en.md b/docs/_posts/prabod/2023-06-21-instructor_large_en.md new file mode 100644 index 00000000000000..fb040c3cf37918 --- /dev/null +++ b/docs/_posts/prabod/2023-06-21-instructor_large_en.md @@ -0,0 +1,74 @@ +--- +layout: model +title: Instructor Large Sentence Embeddings +author: John Snow Labs +name: instructor_large +date: 2023-06-21 +tags: [en, open_source, tensorflow] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: tensorflow +annotator: InstructorEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Instructor👨‍🏫, an instruction-finetuned text embedding model that can generate text embeddings tailored to any task (e.g., classification, retrieval, clustering, text evaluation, etc.) and domains (e.g., science, finance, etc.) by simply providing the task instruction, without any finetuning. Instructor👨‍ achieves sota on 70 diverse embedding tasks. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/instructor_large_en_5.0.0_3.0_1687351199226.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/instructor_large_en_5.0.0_3.0_1687351199226.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +instruction = InstructorEmbeddings.pretrained("instructor_large","en") \ + .setInstruction("Instruction here: ") \ + .setInputCols(["documents"]) \ + .setOutputCol("instructor") + +pipeline = Pipeline().setStages([document_assembler, instruction]) +``` +```scala + val embeddings = InstructorEmbeddings + .pretrained("instructor_large","en") + .setInstruction("Instruction here: ") + .setInputCols(Array("document")) + .setOutputCol("instructor") + val pipeline = new Pipeline().setStages(Array(document, embeddings)) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|instructor_large| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[documents]| +|Output Labels:|[instructor]| +|Language:|en| +|Size:|1.2 GB| + +## References + +https://huggingface.co/hkunlp/instructor-large \ No newline at end of file