Merge pull request #2211 from dhermes/language-impl-6

Adding annotate_text() method in language and helper classes needed for it
googleapis · Aug 29, 2016 · faf09c0 · faf09c0
2 parents a678774 + fd1ef8c
commit faf09c0
Show file tree

Hide file tree

Showing 7 changed files with 632 additions and 24 deletions.
diff --git a/docs/language-responses.rst b/docs/language-responses.rst
@@ -14,3 +14,10 @@ Sentiment
 .. automodule:: gcloud.language.sentiment
   :members:
   :show-inheritance:
+
+Syntax
+~~~~~~
+
+.. automodule:: gcloud.language.syntax
+  :members:
+  :show-inheritance:
diff --git a/docs/language-usage.rst b/docs/language-usage.rst
@@ -216,7 +216,7 @@ machine learning and need in-depth text features to build upon.
 The method returns a named tuple with four entries:
 
 * ``sentences``: A :class:`list` of sentences in the text
-* ``tokens``: A :class:`list` of :class:`~gcloud.language.token.Token`
+* ``tokens``: A :class:`list` of :class:`~gcloud.language.syntax.Token`
   object (e.g. words, punctuation)
 * ``sentiment``: The :class:`~gcloud.language.sentiment.Sentiment` of
   the text (as returned by

diff --git a/gcloud/language/document.py b/gcloud/language/document.py
@@ -17,14 +17,38 @@
 A document is used to hold text to be analyzed and annotated.
 """
 
+import collections
+
 from gcloud.language.entity import Entity
 from gcloud.language.sentiment import Sentiment
+from gcloud.language.syntax import Sentence
+from gcloud.language.syntax import Token
 
 
 DEFAULT_LANGUAGE = 'en-US'
 """Default document language, English."""
 
 
+Annotations = collections.namedtuple(
+    'Annotations',
+    'sentences tokens sentiment entities')
+"""Annotations for a document.
+
+:type sentences: list
+:param sentences: List of :class:`.Sentence` in a document.
+
+:type tokens: list
+:param tokens: List of :class:`.Token` from a document.
+
+:type sentiment: :class:`Sentiment`
+:param sentiment: The sentiment of a document.
+
+:type entities: list
+:param entities: List of :class:`~.language.entity.Entity`
+                 found in a document.
+"""
+
+
 class Encoding(object):
     """Document text encoding types."""
 
@@ -163,3 +187,75 @@ def analyze_sentiment(self):
         api_response = self.client.connection.api_request(
             method='POST', path='analyzeSentiment', data=data)
         return Sentiment.from_api_repr(api_response['documentSentiment'])
+
+    def annotate_text(self, include_syntax=True, include_entities=True,
+                      include_sentiment=True):
+        """Advanced natural language API: document syntax and other features.
+
+        Includes the full functionality of :meth:`analyze_entities` and
+        :meth:`analyze_sentiment`, enabled by the flags
+        ``include_entities`` and ``include_sentiment`` respectively.
+
+        In addition ``include_syntax`` adds a new feature that analyzes
+        the document for semantic and syntactic information.
+
+        .. note::
+
+            This API is intended for users who are familiar with machine
+            learning and need in-depth text features to build upon.
+
+        .. _annotateText: https://cloud.google.com/natural-language/\
+                          reference/rest/v1beta1/documents/annotateText
+
+        See `annotateText`_.
+
+        :type include_syntax: bool
+        :param include_syntax: (Optional) Flag to enable syntax analysis
+                               of the current document.
+
+        :type include_entities: bool
+        :param include_entities: (Optional) Flag to enable entity extraction
+                                 from the current document.
+
+        :type include_sentiment: bool
+        :param include_sentiment: (Optional) Flag to enable sentiment
+                                  analysis of the current document.
+
+        :rtype: :class:`Annotations`
+        :returns: A tuple of each of the four values returned from the API:
+                  sentences, tokens, sentiment and entities.
+        """
+        features = {}
+        if include_syntax:
+            features['extractSyntax'] = True
+        if include_entities:
+            features['extractEntities'] = True
+        if include_sentiment:
+            features['extractDocumentSentiment'] = True
+
+        data = {
+            'document': self._to_dict(),
+            'features': features,
+            'encodingType': self.encoding,
+        }
+        api_response = self.client.connection.api_request(
+            method='POST', path='annotateText', data=data)
+
+        sentences = [Sentence.from_api_repr(sentence)
+                     for sentence in api_response['sentences']]
+        tokens = [Token.from_api_repr(token)
+                  for token in api_response['tokens']]
+        sentiment_info = api_response.get('documentSentiment')
+        if sentiment_info is None:
+            sentiment = None
+        else:
+            sentiment = Sentiment.from_api_repr(sentiment_info)
+        entities = [Entity.from_api_repr(entity)
+                    for entity in api_response['entities']]
+        annotations = Annotations(
+            sentences=sentences,
+            tokens=tokens,
+            sentiment=sentiment,
+            entities=entities,
+        )
+        return annotations
diff --git a/gcloud/language/sentiment.py b/gcloud/language/sentiment.py
@@ -28,7 +28,6 @@ class Sentiment(object):
 
     See `Sentiment message`_ and `Sentiment basics`_.
 
-
     :type polarity: float
     :param polarity: Polarity of the sentiment in the ``[-1.0, 1.0]`` range.
                      Larger numbers represent more positive sentiments.
@@ -45,7 +44,7 @@ def __init__(self, polarity, magnitude):
 
     @classmethod
     def from_api_repr(cls, payload):
-        """Convert an Sentiment from the JSON API into a :class:`Sentiment`.
+        """Convert a Sentiment from the JSON API into a :class:`Sentiment`.
 
         :param payload: dict
         :type payload: The value from the backend.

diff --git a/gcloud/language/syntax.py b/gcloud/language/syntax.py
@@ -0,0 +1,203 @@
+# Copyright 2016 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Google Cloud Natural Language API helpers for tokenized text.
+
+The ``annotateText`` method, when used with the "syntax" feature,
+breaks a document down into tokens and sentences.
+"""
+
+
+class PartOfSpeech(object):
+    """Part of speech of a :class:`Token`."""
+
+    UNKNOWN = 'UNKNOWN'
+    """Unknown part of speech."""
+
+    ADJECTIVE = 'ADJ'
+    """Part of speech: Adjective."""
+
+    ADPOSITION = 'ADP'
+    """Adposition (preposition and postposition)."""
+
+    ADVERB = 'ADV'
+    """Adverb."""
+
+    CONJUNCTION = 'CONJ'
+    """Conjunction."""
+
+    DETERMINER = 'DET'
+    """Determiner."""
+
+    NOUN = 'NOUN'
+    """Noun (common and proper)."""
+
+    CARDINAL_NUMBER = 'NUM'
+    """Cardinal number."""
+
+    PRONOUN = 'PRON'
+    """Pronoun."""
+
+    PARTICIPLE = 'PRT'
+    """Particle or other function word."""
+
+    PUNCTUATION = 'PUNCT'
+    """Punctuation."""
+
+    VERB = 'VERB'
+    """Verb (all tenses and modes)."""
+
+    OTHER = 'X'
+    """Other: foreign words, typos, abbreviations."""
+
+    AFFIX = 'AFFIX'
+    """Affix."""
+
+    _REVERSE_MAP = {
+        'UNKNOWN': 'UNKNOWN',
+        'ADJ': 'ADJECTIVE',
+        'ADP': 'ADPOSITION',
+        'ADV': 'ADVERB',
+        'CONJ': 'CONJUNCTION',
+        'DET': 'DETERMINER',
+        'NOUN': 'NOUN',
+        'NUM': 'CARDINAL_NUMBER',
+        'PRON': 'PRONOUN',
+        'PRT': 'PARTICIPLE',
+        'PUNCT': 'PUNCTUATION',
+        'VERB': 'VERB',
+        'X': 'OTHER',
+        'AFFIX': 'AFFIX',
+    }
+
+    @classmethod
+    def reverse(cls, tag):
+        """Return the attribute name on this class for an API enum tag.
+
+        For example::
+
+            >>> PartOfSpeech.OTHER
+            'X'
+            >>> PartOfSpeech.reverse('X')
+            'OTHER'
+
+        :rtype: str
+        :returns: The attribute name corresponding to the API part of
+                  speech enum.
+        """
+        return cls._REVERSE_MAP[tag]
+
+
+class Token(object):
+    """A Google Cloud Natural Language API token object.
+
+    .. _Token message: https://cloud.google.com/natural-language/reference\
+                       /rest/v1beta1/documents/annotateText#Token
+    .. _Lemma: https://en.wikipedia.org/wiki/Lemma_(morphology)
+    .. _Label enum: https://cloud.google.com/natural-language/reference/\
+                    rest/v1beta1/documents/annotateText#Label
+
+    See `Token message`_.
+
+    :type text_content: str
+    :param text_content: The text that the token is composed of.
+
+    :type text_begin: int
+    :param text_begin: The beginning offset of the content in the original
+                       document according to the encoding type specified
+                       in the API request.
+
+    :type part_of_speech: str
+    :param part_of_speech: The part of speech of the token. See
+                           :class:`PartOfSpeech` for possible values.
+
+    :type edge_index: int
+    :param edge_index: The head of this token in the dependency tree. This is
+                       the index of the token which has an arc going to this
+                       token. The index is the position of the token in the
+                       array of tokens returned by the API method. If this
+                       token is a root token, then the ``edge_index`` is
+                       its own index.
+
+    :type edge_label: str
+    :param edge_label: See `Label enum`_.
+
+    :type lemma: str
+    :param lemma: The `Lemma`_ of the token.
+    """
+
+    def __init__(self, text_content, text_begin, part_of_speech,
+                 edge_index, edge_label, lemma):
+        self.text_content = text_content
+        self.text_begin = text_begin
+        self.part_of_speech = part_of_speech
+        self.edge_index = edge_index
+        self.edge_label = edge_label
+        self.lemma = lemma
+
+    @classmethod
+    def from_api_repr(cls, payload):
+        """Convert a token from the JSON API into a :class:`Token`.
+
+        :type payload: dict
+        :param payload: The value from the backend.
+
+        :rtype: :class:`Token`
+        :returns: The token parsed from the API representation.
+        """
+        text_span = payload['text']
+        text_content = text_span['content']
+        text_begin = text_span['beginOffset']
+        part_of_speech = payload['partOfSpeech']['tag']
+        edge = payload['dependencyEdge']
+        edge_index = edge['headTokenIndex']
+        edge_label = edge['label']
+        lemma = payload['lemma']
+        return cls(text_content, text_begin, part_of_speech,
+                   edge_index, edge_label, lemma)
+
+
+class Sentence(object):
+    """A Google Cloud Natural Language API sentence object.
+
+    .. _Sentence message: https://cloud.google.com/natural-language/reference\
+                          /rest/v1beta1/documents/annotateText#Sentence
+
+    See `Sentence message`_.
+
+    :type content: str
+    :param content: The text that the sentence is composed of.
+
+    :type begin: int
+    :param begin: The beginning offset of the sentence in the original
+                  document according to the encoding type specified
+                  in the API request.
+    """
+
+    def __init__(self, content, begin):
+        self.content = content
+        self.begin = begin
+
+    @classmethod
+    def from_api_repr(cls, payload):
+        """Convert a sentence from the JSON API into a :class:`Sentence`.
+
+        :type payload: dict
+        :param payload: The value from the backend.
+
+        :rtype: :class:`Sentence`
+        :returns: The sentence parsed from the API representation.
+        """
+        text_span = payload['text']
+        return cls(text_span['content'], text_span['beginOffset'])