Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions datacommons/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@
# limitations under the License.

# Data Commons SPARQL query support
from datacommons.query import Query
from datacommons.query import query

# Data Commons Python Client API
from datacommons.core import get_property_labels, get_property_values, get_triples
from datacommons.places import get_places_in
from datacommons.populations import get_populations, get_observations, get_pop_obs
from datacommons.populations import get_populations, get_observations, get_pop_obs, get_place_obs

# Other utilities
from .utils import set_api_key, clean_frame, flatten_frame
147 changes: 139 additions & 8 deletions datacommons/populations.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,13 +338,144 @@ def get_pop_obs(dcid):

Each :obj:`Observation` is represented by a :code:`dict` that has the keys:

- :code:`measuredProp`
- :code:`observationDate`
- :code:`observationPeriod` (optional)
- :code:`measurementMethod` (optional)
- one of: :code:`measuredValue`, :code:`meanValue`, :code:`maxValue`,
:code:`minValue`, :code:`medianValue`

- :code:`measuredProp`: The property measured by the :obj:`Observation`.
- :code:`observationDate`: The date when the :obj:`Observation` was made.
- :code:`observationPeriod` (optional): The period over which the
:obj:`Observation` was made.
- :code:`measurementMethod` (optional): A field providing additional
information on how the :obj:`Observation` was collected.
- Additional fields that denote values measured by the :obj:`Observation`.
These may include the following: :code:`measuredValue`, :code:`meanValue`,
:code:`medianValue`, :code:`maxValue`, :code:`minValue`, :code:`sumValue`,
:code:`marginOfError`, :code:`stdError`, :code:`meanStdError`, and others.
"""
url = utils._API_ROOT + utils._API_ENDPOINTS['get_pop_obs'] + '?dcid={}'.format(dcid)
return utils._send_request(url, compress=True, post=False)
return utils._send_request(url, compress=True, post=False)

def get_place_obs(place_type, population_type, constraining_properties=None):
    """ Returns all :obj:`StatisticalPopulation`'s and :obj:`Observation`'s for \
    all places of the given :code:`place_type`.

    Args:
      place_type (:obj:`str`): The type of places to query
        :obj:`StatisticalPopulation`'s and :obj:`Observation`'s for.
      population_type (:obj:`str`): The population type of the
        :obj:`StatisticalPopulation`
      constraining_properties (:obj:`map` from :obj:`str` to :obj:`str`, optional):
        A map from constraining property to the value that the
        :obj:`StatisticalPopulation` should be constrained by. Omitting it (or
        passing :obj:`None`) applies no constraints.

    Returns:
      Given a :code:`Place` type (e.g. :obj:`State`, :obj:`County`, :obj:`City`),
      a :code:`population_type` (e.g. :obj:`Person`), and optionally a set of
      constraining properties defining the :obj:`StatisticalPopulation`, this
      function returns *all* :obj:`StatisticalPopulation`'s and
      :obj:`Observation`'s for all places of the given type. See examples for
      more details on how the format of the return value is structured.

    Raises:
      ValueError: If the payload returned by the Data Commons REST API is
        malformed.

    Examples:
      We would like to get all :obj:`StatisticalPopulation`'s and
      :obj:`Observation`'s for all places of type :obj:`City` where the
      populations have a population type of :obj:`Person` and are specified by
      the following constraining properties.

      - Persons should have `age <https://browser.datacommons.org/kg?dcid=age>`_
        with value `Years5To17 <https://browser.datacommons.org/kg?dcid=Years5To17>`_
      - Persons should have `placeOfBirth <https://browser.datacommons.org/kg?dcid=placeOfBirth>`_
        with value BornInOtherStateInTheUnitedStates.

      >>> props = {
      ...   'age': 'Years5To17',
      ...   'placeOfBirth': 'BornInOtherStateInTheUnitedStates'
      ... }
      >>> get_place_obs('City', 'Person', constraining_properties=props)
      [
        {
          'name': 'Marcus Hook borough',
          'place': 'geoId/4247344',
          'populations': {
            'dc/p/pq6frs32sfvk': {
              'observations': [
                {
                  'id': 'dc/o/0005qml1el8qh',
                  'marginOfError': 39,
                  'measuredProp': 'count',
                  'measuredValue': 67,
                  'measurementMethod': 'CensusACS5yrSurvey',
                  'observationDate': '2014',
                  'provenanceId': 'dc/3j71hj1',
                  'type': 'Observation'
                },
                {
                  'id': 'dc/o/wvskpk5vyjkhb',
                  'marginOfError': 33,
                  'measuredProp': 'count',
                  'measuredValue': 58,
                  'measurementMethod': 'CensusACS5yrSurvey',
                  'observationDate': '2015',
                  'provenanceId': 'dc/3j71hj1',
                  'type': 'Observation'
                },
                {
                  'id': 'dc/o/3h44trf3vyrm3',
                  'marginOfError': 36,
                  'measuredProp': 'count',
                  'measuredValue': 42,
                  'measurementMethod': 'CensusACS5yrSurvey',
                  'observationDate': '2011',
                  'provenanceId': 'dc/3j71hj1',
                  'type': 'Observation'
                },
                # More observations...
              ],
              'provenanceId': 'dc/3j71hj1'
            }
          }
        },
        # Entries for more cities...
      ]

      The value returned by :code:`get_place_obs` is a :obj:`list` of
      :obj:`dict`'s. Each dictionary corresponds to
      :obj:`StatisticalPopulation`'s matching the given :code:`population_type`
      and :code:`constraining_properties` for a single place of the given
      :code:`place_type`. The dictionary contains the following keys.

      - :code:`name`: The name of the place being described.
      - :code:`place`: The dcid associated with the place being described.
      - :code:`populations`: A :obj:`dict` mapping :code:`StatisticalPopulation`
        dcids to a :obj:`dict` with a list of :code:`observations` and the
        :code:`provenanceId` identifying the source that defined the
        :code:`StatisticalPopulation`.

      Each :obj:`Observation` is represented by a :obj:`dict` with the following
      keys.

      - :code:`id`: The :code:`dcid` identifying the :obj:`Observation`.
      - :code:`provenanceId`: The dcid identifying the source that defined this
        :obj:`Observation`.
      - :code:`type`: The type associated with the :obj:`Observation`.
      - :code:`measuredProp`: The property measured by the :obj:`Observation`.
      - :code:`observationDate`: The date when the :obj:`Observation` was made.
      - :code:`observationPeriod` (optional): The period over which the
        :obj:`Observation` was made.
      - :code:`measurementMethod` (optional): A field identifying how the
        :obj:`Observation` was made
      - Additional fields that denote values measured by the :obj:`Observation`.
        These may include the following: :code:`measuredValue`,
        :code:`meanValue`, :code:`medianValue`, :code:`maxValue`,
        :code:`minValue`, :code:`sumValue`, :code:`marginOfError`,
        :code:`stdError`, :code:`meanStdError`, and others.
    """
    # A literal `{}` default would be one dict object shared by every call, so
    # the "no constraints" default is spelled as None and expanded here.
    if constraining_properties is None:
        constraining_properties = {}

    # Create the json payload and send it to the REST API.
    pv = [{'property': k, 'value': v} for k, v in constraining_properties.items()]
    url = utils._API_ROOT + utils._API_ENDPOINTS['get_place_obs']
    payload = utils._send_request(url, req_json={
        'place_type': place_type,
        'population_type': population_type,
        'pvs': pv,
    }, compress=True)

    # Only the per-place entries of the payload are handed back to the caller.
    return payload['places']
207 changes: 86 additions & 121 deletions datacommons/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
# limitations under the License.
""" Data Commons Python Client API Query Module.

Implements a wrapper object for sending SPARQL queries to the Data Commons
knowledge graph.
Implements functions for sending graph queries to the Data Commons knowledge
graph.
"""

from __future__ import absolute_import
Expand All @@ -26,132 +26,97 @@
import os
import requests

# -----------------------------------------------------------------------------
# Query Class
# -----------------------------------------------------------------------------
# ----------------------------- WRAPPER FUNCTIONS -----------------------------


def _row_to_map(header, row, error_type=ValueError):
    """ Builds the map from query variable to cell value for one result row.

    Args:
      header (:obj:`list`): The query variables, in column order.
      row (:obj:`dict`): A result row whose :code:`'cells'` entry lists one cell
        per column.
      error_type (:obj:`type`, optional): The exception class raised when the
        row is malformed.

    Returns:
      A :obj:`dict` mapping each query variable to the :code:`'value'` of the
      cell in the same column.

    Raises:
      error_type: If the row has more cells than there are query variables, or
        if a cell has no :code:`'value'` entry.
    """
    row_map = {}
    for idx, cell in enumerate(row['cells']):
        # `>=`, not `>`: idx == len(header) is already one column too many and
        # would otherwise surface as an IndexError from `header[idx]` below.
        if idx >= len(header):
            raise error_type('Query error: unexpected cell {}'.format(cell))
        if 'value' not in cell:
            raise error_type('Query error: cell missing value {}'.format(cell))
        row_map[header[idx]] = cell['value']
    return row_map


def query(query_string, select=None):
    """ Returns the results of executing a SPARQL query on the Data Commons graph.

    Args:
      query_string (:obj:`str`): The SPARQL query string.
      select (:obj:`func` accepting a row in the query result): A function that
        selects rows to be returned by :code:`query`. This function accepts a
        row in the results of executing :code:`query_string` and returns True
        if and only if the row is to be returned by :code:`query`. The row
        passed in as an argument is represented as a :obj:`dict` that maps a
        query variable in :code:`query_string` to its value in the given row.

    Returns:
      A table, represented as a :obj:`list` of rows, resulting from executing
      the given SPARQL query. Each row is a :obj:`dict` mapping query variable
      to its value in the row. If `select` is not `None`, then a row is included
      in the returned :obj:`list` if and only if `select` returns :obj:`True`
      for that row.

    Raises:
      ValueError: If no API key is set, if the REST API answers with a non-200
        status code, or if the payload returned by the Data Commons REST API is
        malformed.

    Examples:
      We would like to query for the name associated with three states
      identified by their dcids
      `California <https://browser.datacommons.org/kg?dcid=geoId/06>`_,
      `Kentucky <https://browser.datacommons.org/kg?dcid=geoId/21>`_, and
      `Maryland <https://browser.datacommons.org/kg?dcid=geoId/24>`_.

      >>> query_str = '''
      ... SELECT ?name ?dcid
      ... WHERE {
      ...   ?a typeOf Place .
      ...   ?a name ?name .
      ...   ?a dcid ("geoId/06" "geoId/21" "geoId/24") .
      ...   ?a dcid ?dcid
      ... }
      ... '''
      >>> result = query(query_str)
      >>> for r in result:
      ...   print(r)
      {"?name": "Maryland", "?dcid": "geoId/24"}
      {"?name": "Kentucky", "?dcid": "geoId/21"}
      {"?name": "California", "?dcid": "geoId/06"}

      Optionally, we can specify which rows are returned by setting
      :code:`select` like so. The following returns all rows where the name is
      "Maryland".

      >>> selector = lambda row: row['?name'] == 'Maryland'
      >>> result = query(query_str, select=selector)
      >>> for r in result:
      ...   print(r)
      {"?name": "Maryland", "?dcid": "geoId/24"}
    """
    # Get the API Key and perform the POST request.
    if not os.environ.get(_ENV_VAR_API_KEY, None):
        raise ValueError(
            'Request error: Must set an API key before using the API!')
    url = _API_ROOT + _API_ENDPOINTS['query']
    res = requests.post(url, json={'sparql': query_string}, headers={
        'x-api-key': os.environ[_ENV_VAR_API_KEY]
    })

    # Verify then store the results.
    if res.status_code != 200:
        raise ValueError(
            'Response error: An HTTP {} code was returned by the mixer. Printing '
            'response\n\n{}'.format(res.status_code, res.text))
    res_json = res.json()

    # A payload without a header cannot be mapped to query variables; report it
    # as the documented ValueError instead of leaking a KeyError.
    header = res_json.get('header')
    if header is None:
        raise ValueError(
            'Response error: the response does not contain a header.')

    # Iterate through the query results. A response without a 'rows' entry is
    # treated as an empty result.
    result_rows = []
    for row in res_json.get('rows', []):
        row_map = _row_to_map(header, row)
        # Add the row to the result rows if it is selected
        if select is None or select(row_map):
            result_rows.append(row_map)
    return result_rows


class Query(object):
    """ A wrapper object that performs a SPARQL query on the Data Commons graph.

    Args:
      **kwargs: Valid keyword arguments include the following. At least one
        valid argument must be provided.

        - `sparql` (:obj:`str`): The SPARQL query string.

    Raises:
      ValueError: If no valid keyword argument is provided.

    Example:
      To construct a :obj:`Query` object, do the following.

      >>> query_str = '''
      ... SELECT ?name ?dcid
      ... WHERE {
      ...   ?a typeOf Place .
      ...   ?a name ?name .
      ...   ?a dcid ("geoId/06" "geoId/21" "geoId/24") .
      ...   ?a dcid ?dcid
      ... }
      ... '''
      >>> query = dc.Query(sparql=query_str)
    """

    # Valid query languages
    _SPARQL_LANG = 'sparql'
    _VALID_LANG = [_SPARQL_LANG]

    def __init__(self, **kwargs):
        """ Initializes a SPARQL query targeting the Data Commons graph. """
        if self._SPARQL_LANG in kwargs:
            self._query = kwargs[self._SPARQL_LANG]
            self._language = self._SPARQL_LANG
            self._result = None
        else:
            lang_str = ', '.join(self._VALID_LANG)
            raise ValueError(
                'Must provide one of the following languages: {}'.format(lang_str))

    def rows(self, select=None):
        """ Returns the result of executing the query as an iterator over all rows.

        Args:
          select (:obj:`func` accepting a `row` in the query result): A function
            that returns true if and only if a row in the query results should
            be kept. The argument for this function is a :obj:`dict` from query
            variable to its value in a given row.

        Yields:
          Rows from executing the query where each row is a :obj:`dict` mapping
          query variable to its value in the row. If `select` is not `None`,
          then the row is returned if and only if `select` returns :obj:`True`.

        Raises:
          RuntimeError: If a row has more cells than there are query variables,
            or if a cell has no value.

        Example:
          The following query asks for names of three states:
          `California <https://browser.datacommons.org/kg?dcid=geoId/06>`_,
          `Kentucky <https://browser.datacommons.org/kg?dcid=geoId/21>`_, and
          `Maryland <https://browser.datacommons.org/kg?dcid=geoId/24>`_.

          >>> query_str = '''
          ... SELECT ?name ?dcid
          ... WHERE {
          ...   ?a typeOf Place .
          ...   ?a name ?name .
          ...   ?a dcid ("geoId/06" "geoId/21" "geoId/24") .
          ...   ?a dcid ?dcid
          ... }
          ... '''
          >>> query = dc.Query(sparql=query_str)
          >>> for r in query.rows():
          ...   print(r)
          {"?name": "Maryland", "?dcid": "geoId/24"}
          {"?name": "Kentucky", "?dcid": "geoId/21"}
          {"?name": "California", "?dcid": "geoId/06"}
        """
        # Execute the query if the results are empty.
        if not self._result:
            self._execute()

        # Iterate through the query results
        header = self._result['header']
        for row in self._result['rows']:
            row_map = _row_to_map(header, row, error_type=RuntimeError)

            # Yield the row if it is selected
            if select is None or select(row_map):
                yield row_map

    def _execute(self):
        """ Execute the query.

        Raises:
          ValueError: If no API key is set.
          RuntimeError: on query failure (see error hint).
        """
        # Get the API Key and set the headers
        if not os.environ.get(_ENV_VAR_API_KEY, None):
            raise ValueError(
                'Request error: Must set an API key before using the API!')
        headers = {'x-api-key': os.environ[_ENV_VAR_API_KEY]}

        # Create the query request. SPARQL is the only language that __init__
        # accepts, so the payload is always a SPARQL payload.
        payload = {'sparql': self._query}
        url = _API_ROOT + _API_ENDPOINTS['query']
        res = requests.post(url, json=payload, headers=headers)

        # Verify then store the results. The body is parsed once and reused.
        res_json = res.json()
        if 'message' in res_json:
            raise RuntimeError('Query error: {}'.format(res_json['message']))
        self._result = res_json
Loading