[text analytics] add string-index-type support (#13378)

Azure · Aug 28, 2020 · 3891c08 · 3891c08
1 parent bd05a04
commit 3891c08
Show file tree

Hide file tree

Showing 450 changed files with 3,274 additions and 1,669 deletions.
diff --git a/sdk/textanalytics/azure-ai-textanalytics/azure/ai/textanalytics/_models.py b/sdk/textanalytics/azure-ai-textanalytics/azure/ai/textanalytics/_models.py
@@ -207,7 +207,9 @@ class CategorizedEntity(DictMixin):
     :ivar subcategory: Entity subcategory, such as Age/Year/TimeRange etc
     :vartype subcategory: str
     :ivar int offset: The entity text offset from the start of the document.
-    :ivar int length: The length of the entity text.
+        Returned in Unicode code points.
+    :ivar int length: The length of the entity text. Returned
+        in Unicode code points.
     :ivar confidence_score: Confidence score between 0 and 1 of the extracted
         entity.
     :vartype confidence_score: float
@@ -253,7 +255,9 @@ class PiiEntity(DictMixin):
     :ivar str subcategory: Entity subcategory, such as Credit Card/EU
         Phone number/ABA Routing Numbers, etc.
     :ivar int offset: The PII entity text offset from the start of the document.
-    :ivar int length: The length of the PII entity text.
+        Returned in Unicode code points.
+    :ivar int length: The length of the PII entity text. Returned
+        in Unicode code points.
     :ivar float confidence_score: Confidence score between 0 and 1 of the extracted
         entity.
     """
@@ -636,7 +640,9 @@ class LinkedEntityMatch(DictMixin):
     :vartype confidence_score: float
     :ivar text: Entity text as it appears in the request.
     :ivar int offset: The linked entity match text offset from the start of the document.
-    :ivar int length: The length of the linked entity match text.
+        Returned in Unicode code points.
+    :ivar int length: The length of the linked entity match text. Returned
+        in Unicode code points.
     :vartype text: str
     """
 
@@ -738,8 +744,10 @@ class SentenceSentiment(DictMixin):
         and 1 for the sentence for all labels.
     :vartype confidence_scores:
         ~azure.ai.textanalytics.SentimentConfidenceScores
-    :ivar int offset: The sentence offset from the start of the document.
-    :ivar int length: The length of the sentence.
+    :ivar int offset: The sentence offset from the start of the document. Returned
+        in Unicode code points.
+    :ivar int length: The length of the sentence. Returned
+        in Unicode code points.
     :ivar mined_opinions: The list of opinions mined from this sentence.
         For example in "The food is good, but the service is bad", we would
         mine these two opinions "food is good", "service is bad". Only returned
@@ -847,8 +855,10 @@ class AspectSentiment(DictMixin):
         for 'neutral' will always be 0
     :vartype confidence_scores:
         ~azure.ai.textanalytics.SentimentConfidenceScores
-    :ivar int offset: The aspect offset from the start of the document.
-    :ivar int length: The length of the aspect.
+    :ivar int offset: The aspect offset from the start of the document. Returned
+        in Unicode code points.
+    :ivar int length: The length of the aspect. Returned
+        in Unicode code points.
     """
 
     def __init__(self, **kwargs):
@@ -892,8 +902,10 @@ class OpinionSentiment(DictMixin):
         for 'neutral' will always be 0
     :vartype confidence_scores:
         ~azure.ai.textanalytics.SentimentConfidenceScores
-    :ivar int offset: The opinion offset from the start of the document.
-    :ivar int length: The length of the opinion.
+    :ivar int offset: The opinion offset from the start of the document. Returned
+        in Unicode code points.
+    :ivar int length: The length of the opinion. Returned
+        in Unicode code points.
     :ivar bool is_negated: Whether the opinion is negated. For example, in
         "The food is not good", the opinion "good" is negated.
     """

diff --git a/sdk/textanalytics/azure-ai-textanalytics/azure/ai/textanalytics/_text_analytics_client.py b/sdk/textanalytics/azure-ai-textanalytics/azure/ai/textanalytics/_text_analytics_client.py
@@ -93,6 +93,7 @@ def __init__(self, endpoint, credential, **kwargs):
         )
         self._default_language = kwargs.pop("default_language", "en")
         self._default_country_hint = kwargs.pop("default_country_hint", "US")
+        self._string_code_unit = None if kwargs.get("api_version") == "v3.0" else "UnicodeCodePoint"
 
     @distributed_trace
     def detect_language(  # type: ignore
@@ -213,6 +214,8 @@ def recognize_entities(  # type: ignore
         docs = _validate_input(documents, "language", language)
         model_version = kwargs.pop("model_version", None)
         show_stats = kwargs.pop("show_stats", False)
+        if self._string_code_unit:
+            kwargs.update({"string_index_type": self._string_code_unit})
         try:
             return self._client.entities_recognition_general(
                 documents=docs,
@@ -278,6 +281,8 @@ def recognize_pii_entities(  # type: ignore
         docs = _validate_input(documents, "language", language)
         model_version = kwargs.pop("model_version", None)
         show_stats = kwargs.pop("show_stats", False)
+        if self._string_code_unit:
+            kwargs.update({"string_index_type": self._string_code_unit})
         try:
             return self._client.entities_recognition_pii(
                 documents=docs,
@@ -350,6 +355,8 @@ def recognize_linked_entities(  # type: ignore
         docs = _validate_input(documents, "language", language)
         model_version = kwargs.pop("model_version", None)
         show_stats = kwargs.pop("show_stats", False)
+        if self._string_code_unit:
+            kwargs.update({"string_index_type": self._string_code_unit})
         try:
             return self._client.entities_linking(
                 documents=docs,
@@ -490,6 +497,8 @@ def analyze_sentiment(  # type: ignore
         model_version = kwargs.pop("model_version", None)
         show_stats = kwargs.pop("show_stats", False)
         show_opinion_mining = kwargs.pop("show_opinion_mining", None)
+        if self._string_code_unit:
+            kwargs.update({"string_index_type": self._string_code_unit})
 
         if show_opinion_mining is not None:
             kwargs.update({"opinion_mining": show_opinion_mining})

diff --git a/...alytics/azure-ai-textanalytics/azure/ai/textanalytics/aio/_text_analytics_client_async.py b/...alytics/azure-ai-textanalytics/azure/ai/textanalytics/aio/_text_analytics_client_async.py
@@ -98,6 +98,7 @@ def __init__(  # type: ignore
         )
         self._default_language = kwargs.pop("default_language", "en")
         self._default_country_hint = kwargs.pop("default_country_hint", "US")
+        self._string_code_unit = None if kwargs.get("api_version") == "v3.0" else "UnicodeCodePoint"
 
     @distributed_trace_async
     async def detect_language(  # type: ignore
@@ -216,6 +217,8 @@ async def recognize_entities(  # type: ignore
         docs = _validate_input(documents, "language", language)
         model_version = kwargs.pop("model_version", None)
         show_stats = kwargs.pop("show_stats", False)
+        if self._string_code_unit:
+            kwargs.update({"string_index_type": self._string_code_unit})
         try:
             return await self._client.entities_recognition_general(
                 documents=docs,
@@ -280,6 +283,8 @@ async def recognize_pii_entities(  # type: ignore
         docs = _validate_input(documents, "language", language)
         model_version = kwargs.pop("model_version", None)
         show_stats = kwargs.pop("show_stats", False)
+        if self._string_code_unit:
+            kwargs.update({"string_index_type": self._string_code_unit})
         try:
             return await self._client.entities_recognition_pii(
                 documents=docs,
@@ -351,6 +356,8 @@ async def recognize_linked_entities(  # type: ignore
         docs = _validate_input(documents, "language", language)
         model_version = kwargs.pop("model_version", None)
         show_stats = kwargs.pop("show_stats", False)
+        if self._string_code_unit:
+            kwargs.update({"string_index_type": self._string_code_unit})
         try:
             return await self._client.entities_linking(
                 documents=docs,
@@ -489,6 +496,8 @@ async def analyze_sentiment(  # type: ignore
         model_version = kwargs.pop("model_version", None)
         show_stats = kwargs.pop("show_stats", False)
         show_opinion_mining = kwargs.pop("show_opinion_mining", None)
+        if self._string_code_unit:
+            kwargs.update({"string_index_type": self._string_code_unit})
 
         if show_opinion_mining is not None:
             kwargs.update({"opinion_mining": show_opinion_mining})

diff --git a/...xtanalytics/tests/recordings/test_analyze_sentiment.test_all_successful_passing_dict.yaml b/...xtanalytics/tests/recordings/test_analyze_sentiment.test_all_successful_passing_dict.yaml
@@ -19,7 +19,7 @@ interactions:
       User-Agent:
       - azsdk-python-ai-textanalytics/5.0.1 Python/3.8.5 (macOS-10.13.6-x86_64-i386-64bit)
     method: POST
-    uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.1/sentiment?showStats=true&stringIndexType=TextElements_v8
+    uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.1/sentiment?showStats=true&stringIndexType=UnicodeCodePoint
   response:
     body:
       string: '{"statistics":{"documentsCount":3,"validDocumentsCount":3,"erroneousDocumentsCount":0,"transactionsCount":3},"documents":[{"id":"1","sentiment":"neutral","statistics":{"charactersCount":51,"transactionsCount":1},"confidenceScores":{"positive":0.01,"neutral":0.99,"negative":0.0},"sentences":[{"sentiment":"neutral","confidenceScores":{"positive":0.01,"neutral":0.99,"negative":0.0},"offset":0,"length":51,"text":"Microsoft
@@ -30,21 +30,21 @@ interactions:
         recommend you try it."}],"warnings":[]}],"errors":[],"modelVersion":"2020-04-01"}'
     headers:
       apim-request-id:
-      - b1e4352f-1e0f-46e3-9f6e-5a82195726b5
+      - 546ef146-2055-49be-945d-8b4d95870565
       content-type:
       - application/json; charset=utf-8
       csp-billing-usage:
       - CognitiveServices.TextAnalytics.BatchScoring=3
       date:
-      - Wed, 26 Aug 2020 21:20:39 GMT
+      - Thu, 27 Aug 2020 19:31:50 GMT
       strict-transport-security:
       - max-age=31536000; includeSubDomains; preload
       transfer-encoding:
       - chunked
       x-content-type-options:
       - nosniff
       x-envoy-upstream-service-time:
-      - '91'
+      - '84'
     status:
       code: 200
       message: OK

diff --git a/...ts/recordings/test_analyze_sentiment.test_all_successful_passing_text_document_input.yaml b/...ts/recordings/test_analyze_sentiment.test_all_successful_passing_text_document_input.yaml
@@ -19,7 +19,7 @@ interactions:
       User-Agent:
       - azsdk-python-ai-textanalytics/5.0.1 Python/3.8.5 (macOS-10.13.6-x86_64-i386-64bit)
     method: POST
-    uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.1/sentiment?showStats=false&stringIndexType=TextElements_v8
+    uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.1/sentiment?showStats=false&stringIndexType=UnicodeCodePoint
   response:
     body:
       string: '{"documents":[{"id":"1","sentiment":"neutral","confidenceScores":{"positive":0.01,"neutral":0.99,"negative":0.0},"sentences":[{"sentiment":"neutral","confidenceScores":{"positive":0.01,"neutral":0.99,"negative":0.0},"offset":0,"length":51,"text":"Microsoft
@@ -30,21 +30,21 @@ interactions:
         recommend you try it."}],"warnings":[]}],"errors":[],"modelVersion":"2020-04-01"}'
     headers:
       apim-request-id:
-      - 36f47b42-b805-4655-9cc9-ed373487b586
+      - ee67d363-828c-4a5b-92ee-4a943a9aa020
       content-type:
       - application/json; charset=utf-8
       csp-billing-usage:
       - CognitiveServices.TextAnalytics.BatchScoring=3
       date:
-      - Wed, 26 Aug 2020 21:20:35 GMT
+      - Thu, 27 Aug 2020 19:31:50 GMT
       strict-transport-security:
       - max-age=31536000; includeSubDomains; preload
       transfer-encoding:
       - chunked
       x-content-type-options:
       - nosniff
       x-envoy-upstream-service-time:
-      - '83'
+      - '95'
     status:
       code: 200
       message: OK

diff --git a/.../azure-ai-textanalytics/tests/recordings/test_analyze_sentiment.test_bad_credentials.yaml b/.../azure-ai-textanalytics/tests/recordings/test_analyze_sentiment.test_bad_credentials.yaml
@@ -16,7 +16,7 @@ interactions:
       User-Agent:
       - azsdk-python-ai-textanalytics/5.0.1 Python/3.8.5 (macOS-10.13.6-x86_64-i386-64bit)
     method: POST
-    uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.1/sentiment?showStats=false&stringIndexType=TextElements_v8
+    uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.1/sentiment?showStats=false&stringIndexType=UnicodeCodePoint
   response:
     body:
       string: '{"error":{"code":"401","message":"Access denied due to invalid subscription
@@ -26,7 +26,7 @@ interactions:
       content-length:
       - '224'
       date:
-      - Wed, 26 Aug 2020 21:20:35 GMT
+      - Thu, 27 Aug 2020 19:31:56 GMT
     status:
       code: 401
       message: PermissionDenied

diff --git a/...i-textanalytics/tests/recordings/test_analyze_sentiment.test_bad_model_version_error.yaml b/...i-textanalytics/tests/recordings/test_analyze_sentiment.test_bad_model_version_error.yaml
@@ -16,26 +16,26 @@ interactions:
       User-Agent:
       - azsdk-python-ai-textanalytics/5.0.1 Python/3.8.5 (macOS-10.13.6-x86_64-i386-64bit)
     method: POST
-    uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.1/sentiment?model-version=bad&showStats=false&stringIndexType=TextElements_v8
+    uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.1/sentiment?model-version=bad&showStats=false&stringIndexType=UnicodeCodePoint
   response:
     body:
       string: '{"error":{"code":"InvalidRequest","message":"Invalid Request.","innererror":{"code":"ModelVersionIncorrect","message":"Invalid
         model version. Possible values are: latest,2019-10-01,2020-04-01"}}}'
     headers:
       apim-request-id:
-      - e98c3279-f8c4-49ce-b25c-f51289330fdd
+      - 600cfe88-8c7b-4017-a50e-ef0c30a546a4
       content-type:
       - application/json; charset=utf-8
       date:
-      - Wed, 26 Aug 2020 21:20:35 GMT
+      - Thu, 27 Aug 2020 19:31:56 GMT
       strict-transport-security:
       - max-age=31536000; includeSubDomains; preload
       transfer-encoding:
       - chunked
       x-content-type-options:
       - nosniff
       x-envoy-upstream-service-time:
-      - '10'
+      - '4'
     status:
       code: 400
       message: Bad Request

diff --git a/...-ai-textanalytics/tests/recordings/test_analyze_sentiment.test_batch_size_over_limit.yaml b/...-ai-textanalytics/tests/recordings/test_analyze_sentiment.test_batch_size_over_limit.yaml
@@ -760,26 +760,26 @@ interactions:
       User-Agent:
       - azsdk-python-ai-textanalytics/5.0.1 Python/3.8.5 (macOS-10.13.6-x86_64-i386-64bit)
     method: POST
-    uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.1/sentiment?showStats=false&stringIndexType=TextElements_v8
+    uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.1/sentiment?showStats=false&stringIndexType=UnicodeCodePoint
   response:
     body:
       string: '{"error":{"code":"InvalidRequest","message":"Invalid document in request.","innererror":{"code":"InvalidDocumentBatch","message":"Batch
         request contains too many records. Max 10 records are permitted."}}}'
     headers:
       apim-request-id:
-      - 5bcf6f2d-8a67-4bf7-a552-67c0c0ce9f9b
+      - e63eddb4-ac2c-4b1d-bfa8-ff78dc65076f
       content-type:
       - application/json; charset=utf-8
       date:
-      - Wed, 26 Aug 2020 21:20:36 GMT
+      - Thu, 27 Aug 2020 19:31:50 GMT
       strict-transport-security:
       - max-age=31536000; includeSubDomains; preload
       transfer-encoding:
       - chunked
       x-content-type-options:
       - nosniff
       x-envoy-upstream-service-time:
-      - '13'
+      - '12'
     status:
       code: 400
       message: Bad Request

diff --git a/...xtanalytics/tests/recordings/test_analyze_sentiment.test_batch_size_over_limit_error.yaml b/...xtanalytics/tests/recordings/test_analyze_sentiment.test_batch_size_over_limit_error.yaml
@@ -725,18 +725,18 @@ interactions:
       User-Agent:
       - azsdk-python-ai-textanalytics/5.0.1 Python/3.8.5 (macOS-10.13.6-x86_64-i386-64bit)
     method: POST
-    uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.1/sentiment?showStats=false&stringIndexType=TextElements_v8
+    uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.1/sentiment?showStats=false&stringIndexType=UnicodeCodePoint
   response:
     body:
       string: '{"error":{"code":"InvalidRequest","message":"Invalid document in request.","innererror":{"code":"InvalidDocumentBatch","message":"Batch
         request contains too many records. Max 10 records are permitted."}}}'
     headers:
       apim-request-id:
-      - 35aa5189-c6e8-46c5-9339-607d86aef6a1
+      - 22ce0f08-e152-4611-bf63-9cc9ae125568
       content-type:
       - application/json; charset=utf-8
       date:
-      - Wed, 26 Aug 2020 21:20:39 GMT
+      - Thu, 27 Aug 2020 19:31:50 GMT
       strict-transport-security:
       - max-age=31536000; includeSubDomains; preload
       transfer-encoding: