Skip to content

Commit 90a2782

Browse files
authored
Merge pull request #2172 from dhermes/language-impl-3
Adding Document.analyze_entities() in language package
2 parents 94c2743 + b61b82c commit 90a2782

File tree

10 files changed

+452
-24
lines changed

10 files changed

+452
-24
lines changed

docs/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@
157157
language-usage
158158
Client <language-client>
159159
language-document
160+
language-responses
160161

161162
.. toctree::
162163
:maxdepth: 0

docs/language-responses.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
Natural Language Response Classes
2+
=================================
3+
4+
Entity
5+
~~~~~~
6+
7+
.. automodule:: gcloud.language.entity
8+
:members:
9+
:show-inheritance:

docs/language-usage.rst

Lines changed: 30 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -171,25 +171,29 @@ metadata and other properties.
171171
>>> entities = document.analyze_entities()
172172
>>> for entity in entities:
173173
... print('=' * 20)
174-
... print(' name: %s' % (entity.name,))
175-
... print(' type: %s' % (entity.entity_type,))
176-
... print('metadata: %s' % (entity.metadata,))
177-
... print('salience: %s' % (entity.salience,))
174+
... print(' name: %s' % (entity.name,))
175+
... print(' type: %s' % (entity.entity_type,))
176+
... print('wikipedia_url: %s' % (entity.wikipedia_url,))
177+
... print(' metadata: %s' % (entity.metadata,))
178+
... print(' salience: %s' % (entity.salience,))
178179
====================
179-
name: Michelangelo Caravaggio
180-
type: PERSON
181-
metadata: {'wikipedia_url': 'http://en.wikipedia.org/wiki/Caravaggio'}
182-
salience: 0.75942981
180+
name: Michelangelo Caravaggio
181+
type: PERSON
182+
wikipedia_url: http://en.wikipedia.org/wiki/Caravaggio
183+
metadata: {}
184+
salience: 0.7615959
183185
====================
184-
name: Italian
185-
type: LOCATION
186-
metadata: {'wikipedia_url': 'http://en.wikipedia.org/wiki/Italy'}
187-
salience: 0.20193423
186+
name: Italian
187+
type: LOCATION
188+
wikipedia_url: http://en.wikipedia.org/wiki/Italy
189+
metadata: {}
190+
salience: 0.19960518
188191
====================
189-
name: The Calling of Saint Matthew
190-
type: WORK_OF_ART
191-
metadata: {'wikipedia_url': 'http://en.wikipedia.org/wiki/index.html?curid=2838808'}
192-
salience: 0.03863598
192+
name: The Calling of Saint Matthew
193+
type: EVENT
194+
wikipedia_url: http://en.wikipedia.org/wiki/The_Calling_of_St_Matthew_(Caravaggio)
195+
metadata: {}
196+
salience: 0.038798928
193197
194198
Analyze Sentiment
195199
-----------------
@@ -266,14 +270,16 @@ the response is :data:`None`.
266270
>>> # Entities present if include_entities=True
267271
>>> for entity in annotations.entities:
268272
... print('=' * 20)
269-
... print(' name: %s' % (entity.name,))
270-
... print(' type: %s' % (entity.entity_type,))
271-
... print('metadata: %s' % (entity.metadata,))
272-
... print('salience: %s' % (entity.salience,))
273+
... print(' name: %s' % (entity.name,))
274+
... print(' type: %s' % (entity.entity_type,))
275+
... print('wikipedia_url: %s' % (entity.wikipedia_url,))
276+
... print(' metadata: %s' % (entity.metadata,))
277+
... print(' salience: %s' % (entity.salience,))
273278
====================
274-
name: Moon
275-
type: LOCATION
276-
metadata: {'wikipedia_url': 'http://en.wikipedia.org/wiki/Natural_satellite'}
277-
salience: 0.11793101
279+
name: Moon
280+
type: LOCATION
281+
wikipedia_url: http://en.wikipedia.org/wiki/Natural_satellite
282+
metadata: {}
283+
salience: 0.11793101
278284
279285
.. _Features: https://cloud.google.com/natural-language/reference/rest/v1beta1/documents/annotateText#Features

gcloud/language/document.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
A document is used to hold text to be analyzed and annotated.
1818
"""
1919

20+
from gcloud.language.entity import Entity
21+
2022

2123
DEFAULT_LANGUAGE = 'en'
2224
"""Default document language, English."""
@@ -101,3 +103,44 @@ def __init__(self, client, content=None, gcs_url=None, doc_type=PLAIN_TEXT,
101103
self.doc_type = doc_type
102104
self.language = language
103105
self.encoding = encoding
106+
107+
def _to_dict(self):
108+
"""Helper to convert the current document into a dictionary.
109+
110+
To be used when constructing requests.
111+
112+
:rtype: dict
113+
:returns: The Document value as a JSON dictionary.
114+
"""
115+
info = {
116+
'type': self.doc_type,
117+
'language': self.language,
118+
}
119+
if self.content is not None:
120+
info['content'] = self.content
121+
elif self.gcs_url is not None:
122+
info['gcsContentUri'] = self.gcs_url
123+
return info
124+
125+
def analyze_entities(self):
    """Run entity analysis on the current document.

    Posts the document (as produced by :meth:`_to_dict`) together with
    its encoding type to the ``analyzeEntities`` API method and converts
    every entry of the ``entities`` field of the response.

    See:
    https://cloud.google.com/natural-language/reference/rest/v1beta1/documents/analyzeEntities

    :rtype: list
    :returns: A list of :class:`Entity` returned from the API.
    """
    request_body = {
        'document': self._to_dict(),
        'encodingType': self.encoding,
    }
    connection = self.client.connection
    response = connection.api_request(
        method='POST', path='analyzeEntities', data=request_body)

    entities = []
    for resource in response['entities']:
        entities.append(Entity.from_api_repr(resource))
    return entities

gcloud/language/entity.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
# Copyright 2016 Google Inc. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Definition for Google Cloud Natural Language API entities.
16+
17+
An entity is used to describe a proper name extracted from text.
18+
"""
19+
20+
21+
class EntityType(object):
    """Namespace holding the entity type names.

    Every attribute is a plain string whose value equals the attribute
    name.
    """

    #: Unknown entity type.
    UNKNOWN = 'UNKNOWN'

    #: Person entity type.
    PERSON = 'PERSON'

    #: Location entity type.
    LOCATION = 'LOCATION'

    #: Organization entity type.
    ORGANIZATION = 'ORGANIZATION'

    #: Event entity type.
    EVENT = 'EVENT'

    #: Work of art entity type.
    WORK_OF_ART = 'WORK_OF_ART'

    #: Consumer good entity type.
    CONSUMER_GOOD = 'CONSUMER_GOOD'

    #: Other entity type (i.e. known but not classified).
    OTHER = 'OTHER'
47+
48+
49+
class Entity(object):
    """A Google Cloud Natural Language API entity.

    Represents a phrase in text that is a known entity, such as a person,
    an organization, or location. The API associates information, such as
    salience and mentions, with entities.

    The only supported metadata (as of August 2016) is ``wikipedia_url``,
    so this value is split out of ``metadata`` and exposed as its own
    property. The dictionary passed in is copied first and is never
    modified.

    See:
    https://cloud.google.com/natural-language/reference/rest/v1beta1/Entity

    :type name: str
    :param name: The name / phrase identified as the entity.

    :type entity_type: str
    :param entity_type: The type of the entity. See
                        https://cloud.google.com/natural-language/reference/rest/v1beta1/Entity#Type

    :type metadata: dict
    :param metadata: The metadata associated with the entity.

    :type salience: float
    :param salience: The prominence of the entity / phrase within the text
                     containing it.

    :type mentions: list
    :param mentions: List of strings that mention the entity.
    """

    def __init__(self, name, entity_type, metadata, salience, mentions):
        # Work on a shallow copy so that removing ``wikipedia_url`` does
        # not alter the caller's dictionary (e.g. a raw API response).
        metadata = dict(metadata)
        self.name = name
        self.entity_type = entity_type
        self.wikipedia_url = metadata.pop('wikipedia_url', None)
        self.metadata = metadata
        self.salience = salience
        self.mentions = mentions

    @classmethod
    def from_api_repr(cls, payload):
        """Convert an Entity from the JSON API into an :class:`Entity`.

        :type payload: dict
        :param payload: The value from the backend. It must contain the
                        ``name``, ``type``, ``metadata``, ``salience`` and
                        ``mentions`` keys; it is not modified.

        :rtype: :class:`Entity`
        :returns: The entity parsed from the API representation.
        """
        name = payload['name']
        entity_type = payload['type']
        metadata = payload['metadata']
        salience = payload['salience']
        # Each mention is reduced to the text content it refers to.
        mentions = [value['text']['content']
                    for value in payload['mentions']]
        return cls(name, entity_type, metadata, salience, mentions)

gcloud/language/test_document.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,3 +62,124 @@ def test_constructor_text_and_gcs(self):
6262
with self.assertRaises(ValueError):
6363
self._makeOne(None, content='abc',
6464
gcs_url='gs://some-bucket/some-obj.txt')
65+
66+
def test__to_dict_with_content(self):
    # A document built from inline text serializes that text under
    # the 'content' key.
    text = 'Hello World'
    document = self._makeOne(None, content=text)
    expected = {
        'content': text,
        'language': document.language,
        'type': self._getTargetClass().PLAIN_TEXT,
    }
    self.assertEqual(document._to_dict(), expected)
76+
77+
def test__to_dict_with_gcs(self):
    # A document built from a Cloud Storage URL serializes that URL
    # under the 'gcsContentUri' key.
    url = 'gs://some-bucket/some-obj.html'
    document = self._makeOne(None, gcs_url=url)
    expected = {
        'gcsContentUri': url,
        'language': document.language,
        'type': self._getTargetClass().PLAIN_TEXT,
    }
    self.assertEqual(document._to_dict(), expected)
87+
88+
def test__to_dict_with_no_content(self):
    # With neither content nor a GCS URL set, only the type and
    # language are serialized.
    document = self._makeOne(None, content='')
    # Manually unset the content after construction.
    document.content = None
    expected = {
        'language': document.language,
        'type': self._getTargetClass().PLAIN_TEXT,
    }
    self.assertEqual(document._to_dict(), expected)
97+
98+
def test_analyze_entities(self):
    # Exercises Document.analyze_entities() against a canned API
    # response and checks both the parsed entities and the request
    # (method, path and body) handed to the connection.
    from gcloud.language.entity import Entity
    from gcloud.language.entity import EntityType

    klass = self._getTargetClass()
    name1 = 'R-O-C-K'
    name2 = 'USA'
    content = name1 + ' in the ' + name2
    wiki2 = 'http://en.wikipedia.org/wiki/United_States'
    salience1 = 0.91391456
    salience2 = 0.086085409
    response = {
        'entities': [
            {
                'name': name1,
                'type': EntityType.OTHER,
                'metadata': {},
                'salience': salience1,
                'mentions': [
                    {
                        'text': {
                            'content': name1,
                            'beginOffset': -1
                        }
                    }
                ]
            },
            {
                'name': name2,
                'type': EntityType.LOCATION,
                'metadata': {'wikipedia_url': wiki2},
                'salience': salience2,
                'mentions': [
                    {
                        'text': {
                            'content': name2,
                            'beginOffset': -1,
                        },
                    },
                ],
            },
        ],
        'language': 'en',
    }
    connection = _Connection(response)
    client = _Client(connection=connection)
    document = self._makeOne(client, content)

    entities = document.analyze_entities()
    self.assertEqual(len(entities), 2)
    entity1 = entities[0]
    self.assertIsInstance(entity1, Entity)
    self.assertEqual(entity1.name, name1)
    self.assertEqual(entity1.entity_type, EntityType.OTHER)
    self.assertIsNone(entity1.wikipedia_url)
    self.assertEqual(entity1.metadata, {})
    self.assertEqual(entity1.salience, salience1)
    self.assertEqual(entity1.mentions, [name1])
    entity2 = entities[1]
    self.assertIsInstance(entity2, Entity)
    self.assertEqual(entity2.name, name2)
    self.assertEqual(entity2.entity_type, EntityType.LOCATION)
    self.assertEqual(entity2.wikipedia_url, wiki2)
    self.assertEqual(entity2.metadata, {})
    self.assertEqual(entity2.salience, salience2)
    self.assertEqual(entity2.mentions, [name2])

    # Verify the request, including the body sent to the API.
    self.assertEqual(len(connection._requested), 1)
    req = connection._requested[0]
    self.assertEqual(req['path'], 'analyzeEntities')
    self.assertEqual(req['method'], 'POST')
    self.assertEqual(req['data'], {
        'document': {
            'type': klass.PLAIN_TEXT,
            'language': document.language,
            'content': content,
        },
        'encodingType': document.encoding,
    })
169+
170+
171+
class _Connection(object):
172+
173+
def __init__(self, response):
174+
self._response = response
175+
self._requested = []
176+
177+
def api_request(self, **kwargs):
178+
self._requested.append(kwargs)
179+
return self._response
180+
181+
182+
class _Client(object):
183+
184+
def __init__(self, connection=None):
185+
self.connection = connection

0 commit comments

Comments
 (0)