Skip to content

Commit edbffef

Browse files
committed
Adding token class for annotated natural language documents.
1 parent e8c064b commit edbffef

3 files changed

Lines changed: 215 additions & 0 deletions

File tree

docs/language-responses.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,10 @@ Sentiment
1414
.. automodule:: gcloud.language.sentiment
1515
:members:
1616
:show-inheritance:
17+
18+
Tokens
19+
~~~~~~
20+
21+
.. automodule:: gcloud.language.token
22+
:members:
23+
:show-inheritance:

gcloud/language/test_token.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# Copyright 2016 Google Inc. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import unittest
16+
17+
18+
class TestPartOfSpeech(unittest.TestCase):
    """Unit tests for ``gcloud.language.token.PartOfSpeech``."""

    def _getTargetClass(self):
        """Return the class under test, importing it on demand."""
        from gcloud.language.token import PartOfSpeech
        return PartOfSpeech

    def test_reverse(self):
        """``reverse()`` maps every constant's value back to its name."""
        klass = self._getTargetClass()
        # Keep only the public constants: private/dunder names and
        # lower-case names (such as the ``reverse`` method) are skipped.
        constant_names = [
            name for name in dir(klass)
            if not (name.startswith('_') or name.islower())
        ]
        for name in constant_names:
            api_tag = getattr(klass, name)
            self.assertEqual(klass.reverse(api_tag), name)
34+
35+
36+
class TestToken(unittest.TestCase):
    """Unit tests for ``gcloud.language.token.Token``."""

    def _getTargetClass(self):
        """Return the class under test, importing it on demand."""
        from gcloud.language.token import Token
        return Token

    def _makeOne(self, *args, **kw):
        """Instantiate the class under test with the given arguments."""
        target_class = self._getTargetClass()
        return target_class(*args, **kw)

    def test_constructor(self):
        """Each positional argument is stored on the same-named attribute."""
        from gcloud.language.token import PartOfSpeech

        # (attribute name, value) pairs listed in the constructor's
        # positional order.
        expected = [
            ('text_content', 'All'),
            ('text_begin', -1),
            ('part_of_speech', PartOfSpeech.DETERMINER),
            ('edge_index', 3),
            ('edge_label', 'PREDET'),
            ('lemma', 'All'),
        ]
        token = self._makeOne(*[value for _, value in expected])
        for attr_name, value in expected:
            self.assertEqual(getattr(token, attr_name), value)

gcloud/language/token.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
# Copyright 2016 Google Inc. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Google Cloud Natural Language API helpers for tokenized text.
16+
17+
The ``annotateText`` method, when used with the "syntax" feature,
18+
breaks a document down into tokens and sentences.
19+
"""
20+
21+
22+
class PartOfSpeech(object):
    """Part-of-speech tags that can be attached to a :class:`Token`.

    Every upper-case attribute holds the short tag string for one part
    of speech; :meth:`reverse` maps such a tag back to the name of the
    attribute that holds it.
    """

    #: Unknown part of speech.
    UNKNOWN = 'UNKNOWN'

    #: Adjective.
    ADJECTIVE = 'ADJ'

    #: Adposition (preposition and postposition).
    ADPOSITION = 'ADP'

    #: Adverb.
    ADVERB = 'ADV'

    #: Conjunction.
    CONJUNCTION = 'CONJ'

    #: Determiner.
    DETERMINER = 'DET'

    #: Noun (common and proper).
    NOUN = 'NOUN'

    #: Cardinal number.
    CARDINAL_NUMBER = 'NUM'

    #: Pronoun.
    PRONOUN = 'PRON'

    #: Particle or other function word.
    # NOTE(review): the attribute is called PARTICIPLE although it is
    # described as a particle; the name is kept for compatibility.
    PARTICIPLE = 'PRT'

    #: Punctuation.
    PUNCTUATION = 'PUNCT'

    #: Verb (all tenses and modes).
    VERB = 'VERB'

    #: Other: foreign words, typos, abbreviations.
    OTHER = 'X'

    #: Affix.
    AFFIX = 'AFFIX'

    # Tag value -> attribute name.  The keys reference the constants
    # above so that the tag strings are spelled out only once.
    _REVERSE_MAP = {
        UNKNOWN: 'UNKNOWN',
        ADJECTIVE: 'ADJECTIVE',
        ADPOSITION: 'ADPOSITION',
        ADVERB: 'ADVERB',
        CONJUNCTION: 'CONJUNCTION',
        DETERMINER: 'DETERMINER',
        NOUN: 'NOUN',
        CARDINAL_NUMBER: 'CARDINAL_NUMBER',
        PRONOUN: 'PRONOUN',
        PARTICIPLE: 'PARTICIPLE',
        PUNCTUATION: 'PUNCTUATION',
        VERB: 'VERB',
        OTHER: 'OTHER',
        AFFIX: 'AFFIX',
    }

    @classmethod
    def reverse(cls, tag):
        """Translate an API tag into the matching attribute name.

        For example::

            >>> PartOfSpeech.OTHER
            'X'
            >>> PartOfSpeech.reverse('X')
            'OTHER'

        :type tag: str
        :param tag: A part-of-speech tag, i.e. the value of one of the
                    upper-case attributes of this class.

        :rtype: str
        :returns: The name of the attribute whose value is ``tag``.
        :raises KeyError: If ``tag`` is not one of the known tags.
        """
        name_by_tag = cls._REVERSE_MAP
        return name_by_tag[tag]
100+
101+
102+
class Token(object):
    """A single token from the Google Cloud Natural Language API.

    .. _Token message: https://cloud.google.com/natural-language/
                       reference/rest/v1beta1/documents/annotateText#Token
    .. _Lemma: https://en.wikipedia.org/wiki/Lemma_(morphology)
    .. _Label enum: https://cloud.google.com/natural-language/
                    reference/rest/v1beta1/documents/annotateText#Label

    See `Token message`_.

    :type text_content: str
    :param text_content: The text that the token is composed of.

    :type text_begin: int
    :param text_begin: The beginning offset of the content in the original
                       document according to the encoding type specified
                       in the API request.

    :type part_of_speech: str
    :param part_of_speech: The part of speech of the token. See
                           :class:`PartOfSpeech` for possible values.

    :type edge_index: int
    :param edge_index: The head of this token in the dependency tree. This
                       is the index of the token which has an arc going to
                       this token. The index is the position of the token
                       in the array of tokens returned by the API method.
                       If this token is a root token, then the
                       ``edge_index`` is its own index.

    :type edge_label: str
    :param edge_label: See `Label enum`_.

    :type lemma: str
    :param lemma: The `Lemma`_ of the token.
    """

    def __init__(self, text_content, text_begin, part_of_speech,
                 edge_index, edge_label, lemma):
        # Plain value holder: each argument is stored unchanged on the
        # attribute of the same name.

        # The token's text and where it starts.
        self.text_content = text_content
        self.text_begin = text_begin

        # Part-of-speech tag.
        self.part_of_speech = part_of_speech

        # Dependency-tree edge.
        self.edge_index = edge_index
        self.edge_label = edge_label

        # Lemma of the token.
        self.lemma = lemma

0 commit comments

Comments
 (0)