Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 59 additions & 2 deletions bigquery/google/cloud/bigquery/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -773,8 +773,8 @@ def load_table_from_file(
job_config=None):
"""Upload the contents of this table from a file-like object.

Like load_table_from_uri, this creates, starts and returns
a ``LoadJob``.
Similar to :meth:`load_table_from_uri`, this method creates, starts and
returns a :class:`~google.cloud.bigquery.job.LoadJob`.

Arguments:
file_obj (file): A file handle opened in binary mode for reading.
Expand Down Expand Up @@ -833,6 +833,63 @@ def load_table_from_file(
raise exceptions.from_http_response(exc.response)
return self.job_from_resource(response.json())

def load_table_from_dataframe(self, dataframe, destination,
                              num_retries=_DEFAULT_NUM_RETRIES,
                              job_id=None, job_id_prefix=None,
                              location=None, project=None,
                              job_config=None):
    """Load a pandas DataFrame into a table via a parquet upload.

    The DataFrame is serialized to parquet in memory and the resulting
    bytes are passed to :meth:`load_table_from_file`, so, like
    :meth:`load_table_from_uri`, this method creates, starts and returns
    a :class:`~google.cloud.bigquery.job.LoadJob`.

    Arguments:
        dataframe (pandas.DataFrame):
            A :class:`~pandas.DataFrame` containing the data to load.
        destination (google.cloud.bigquery.table.TableReference):
            The destination table to use for loading the data. If it is
            an existing table, the schema of the
            :class:`~pandas.DataFrame` must match the schema of the
            destination table. If the table does not yet exist, the
            schema is inferred from the :class:`~pandas.DataFrame`.

    Keyword Arguments:
        num_retries (int, optional): Number of upload retries.
        job_id (str, optional): Name of the job.
        job_id_prefix (str, optional):
            The user-provided prefix for a randomly generated job ID.
            This parameter will be ignored if a ``job_id`` is also
            given.
        location (str):
            Location where to run the job. Must match the location of
            the destination table.
        project (str, optional):
            Project ID of the project of where to run the job. Defaults
            to the client's project.
        job_config (google.cloud.bigquery.job.LoadJobConfig, optional):
            Extra configuration options for the job. Its
            ``source_format`` is overwritten with ``PARQUET`` on the
            object that was passed in; a new config is created when
            this is ``None``.

    Returns:
        google.cloud.bigquery.job.LoadJob: A new load job.

    Raises:
        ImportError:
            If a usable parquet engine cannot be found. This method
            requires one of :mod:`pyarrow` or :mod:`fastparquet` to be
            installed.
    """
    # Serialize the entire frame into an in-memory parquet payload.
    parquet_stream = six.BytesIO()
    dataframe.to_parquet(parquet_stream)

    load_config = job.LoadJobConfig() if job_config is None else job_config
    # The payload is always parquet, so the format is forced here even
    # when the caller supplied their own configuration.
    load_config.source_format = job.SourceFormat.PARQUET

    # ``rewind=True`` asks load_table_from_file to seek back to the start
    # of the stream, which is positioned at its end after the write above.
    return self.load_table_from_file(
        parquet_stream,
        destination,
        num_retries=num_retries,
        rewind=True,
        job_id=job_id,
        job_id_prefix=job_id_prefix,
        location=location,
        project=project,
        job_config=load_config)

def _do_resumable_upload(self, stream, metadata, num_retries):
"""Perform a resumable upload.

Expand Down
4 changes: 2 additions & 2 deletions bigquery/nox.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def default(session):
if session.interpreter == 'python3.4':
session.install('-e', '.')
else:
session.install('-e', '.[pandas]')
session.install('-e', '.[pandas, pyarrow]')

# IPython does not support Python 2 after version 5.x
if session.interpreter == 'python2.7':
Expand Down Expand Up @@ -142,7 +142,7 @@ def snippets(session, py):
os.path.join('..', 'storage'),
os.path.join('..', 'test_utils'),
)
session.install('-e', '.[pandas]')
session.install('-e', '.[pandas, pyarrow]')

# Run py.test against the system tests.
session.run(
Expand Down
1 change: 1 addition & 0 deletions bigquery/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
]
# Optional dependency groups, keyed by the extra's name (installed with
# e.g. ``pip install -e '.[pandas, pyarrow]'``).
# NOTE(review): presumably passed to ``setup()`` as ``extras_require`` --
# confirm against the rest of this file.
extras = {
    'pandas': 'pandas>=0.17.1',
    'pyarrow': 'pyarrow>=0.4.1',
}


Expand Down
70 changes: 70 additions & 0 deletions bigquery/tests/unit/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,14 @@
import six
from six.moves import http_client
import pytest
try:
import pandas
except (ImportError, AttributeError): # pragma: NO COVER

This comment was marked as spam.

This comment was marked as spam.

This comment was marked as spam.

This comment was marked as spam.

pandas = None
try:
import pyarrow
except (ImportError, AttributeError): # pragma: NO COVER
pyarrow = None

from google.cloud.bigquery.dataset import DatasetReference

Expand Down Expand Up @@ -3484,6 +3492,68 @@ def test_load_table_from_file_bad_mode(self):
with pytest.raises(ValueError):
client.load_table_from_file(file_obj, self.TABLE_REF)

@unittest.skipIf(pandas is None, 'Requires `pandas`')
@unittest.skipIf(pyarrow is None, 'Requires `pyarrow`')
def test_load_table_from_dataframe(self):
    """A DataFrame load delegates to load_table_from_file as parquet.

    ``Client.load_table_from_file`` is patched out, so no request is
    made; the test only inspects what would have been uploaded.
    """
    from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
    from google.cloud.bigquery import job

    client = self._make_client()
    dataframe = pandas.DataFrame([
        {'name': 'Monty', 'age': 100},
        {'name': 'Python', 'age': 60},
    ])

    patcher = mock.patch(
        'google.cloud.bigquery.client.Client.load_table_from_file',
        autospec=True)
    with patcher as mocked_load:
        client.load_table_from_dataframe(dataframe, self.TABLE_REF)

    # With autospec on the class attribute, ``self`` (the client) is
    # recorded as the first positional argument.
    mocked_load.assert_called_once_with(
        client,
        mock.ANY,
        self.TABLE_REF,
        num_retries=_DEFAULT_NUM_RETRIES,
        rewind=True,
        job_id=None,
        job_id_prefix=None,
        location=None,
        project=None,
        job_config=mock.ANY)

    # Each recorded call is a (name, args, kwargs) triple.
    _, positional, keywords = mocked_load.mock_calls[0]

    # The uploaded stream must carry a non-empty bytes payload.
    payload = positional[1].getvalue()
    assert isinstance(payload, bytes)
    assert len(payload) > 0

    # A default config is created and tagged as parquet.
    assert keywords['job_config'].source_format == job.SourceFormat.PARQUET

@unittest.skipIf(pandas is None, 'Requires `pandas`')
@unittest.skipIf(pyarrow is None, 'Requires `pyarrow`')
def test_load_table_from_dataframe_w_custom_job_config(self):
    """A caller-supplied job config is forwarded and set to parquet.

    ``Client.load_table_from_file`` is patched out, so no request is
    made; the test checks that the very same config object is passed
    along with its source format set to PARQUET.
    """
    from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
    from google.cloud.bigquery import job

    client = self._make_client()
    dataframe = pandas.DataFrame([
        {'name': 'Monty', 'age': 100},
        {'name': 'Python', 'age': 60},
    ])
    custom_config = job.LoadJobConfig()

    patcher = mock.patch(
        'google.cloud.bigquery.client.Client.load_table_from_file',
        autospec=True)
    with patcher as mocked_load:
        client.load_table_from_dataframe(
            dataframe, self.TABLE_REF, job_config=custom_config)

    # With autospec on the class attribute, ``self`` (the client) is
    # recorded as the first positional argument.
    mocked_load.assert_called_once_with(
        client,
        mock.ANY,
        self.TABLE_REF,
        num_retries=_DEFAULT_NUM_RETRIES,
        rewind=True,
        job_id=None,
        job_id_prefix=None,
        location=None,
        project=None,
        job_config=mock.ANY)

    # Each recorded call is a (name, args, kwargs) triple.
    _, _, keywords = mocked_load.mock_calls[0]
    forwarded_config = keywords['job_config']
    assert forwarded_config is custom_config
    assert forwarded_config.source_format == job.SourceFormat.PARQUET

# Low-level tests

@classmethod
Expand Down
49 changes: 48 additions & 1 deletion docs/bigquery/snippets.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,12 @@
import six
try:
import pandas
except ImportError:
except (ImportError, AttributeError):
pandas = None
try:
import pyarrow
except (ImportError, AttributeError):
pyarrow = None

from google.cloud import bigquery

Expand Down Expand Up @@ -2073,5 +2077,48 @@ def test_list_rows_as_dataframe(client):
assert len(df) == table.num_rows # verify the number of rows


@pytest.mark.skipif(pandas is None, reason='Requires `pandas`')
@pytest.mark.skipif(pyarrow is None, reason='Requires `pyarrow`')
def test_load_table_from_dataframe(client, to_delete):
    """Load a small pandas DataFrame into a new table and verify it.

    The code between the ``[START]`` / ``[END]`` markers is the sample
    itself; everything outside is test setup and extra verification.

    Arguments:
        client: Client used to create the dataset and run the load job.
        to_delete: List that the created dataset is appended to
            (presumably cleaned up by the fixture -- confirm).
    """
    # A millisecond timestamp in the ID keeps each run's dataset distinct.
    dataset_id = 'load_table_dataframe_dataset_{}'.format(_millis())
    dataset = bigquery.Dataset(client.dataset(dataset_id))
    client.create_dataset(dataset)
    to_delete.append(dataset)

    # [START bigquery_load_table_dataframe]
    # from google.cloud import bigquery
    # client = bigquery.Client()
    # dataset_id = 'my_dataset'

    dataset_ref = client.dataset(dataset_id)
    table_ref = dataset_ref.table('monty_python')
    records = [
        {'title': 'The Meaning of Life', 'release_year': 1983},
        {'title': 'Monty Python and the Holy Grail', 'release_year': 1975},
        {'title': 'Life of Brian', 'release_year': 1979},
        {
            'title': 'And Now for Something Completely Different',
            'release_year': 1971
        },
    ]
    # Optionally set explicit indices.
    # If indices are not specified, a column will be created for the default
    # indices created by pandas.
    index = ['Q24980', 'Q25043', 'Q24953', 'Q16403']
    dataframe = pandas.DataFrame(
        records, index=pandas.Index(index, name='wikidata_id'))

    job = client.load_table_from_dataframe(dataframe, table_ref, location='US')

    job.result()  # Waits for table load to complete.

    assert job.state == 'DONE'
    table = client.get_table(table_ref)
    assert table.num_rows == 4
    # [END bigquery_load_table_dataframe]
    # The named index is expected to appear as its own column next to the
    # two record fields.
    column_names = [field.name for field in table.schema]
    assert sorted(column_names) == ['release_year', 'title', 'wikidata_id']


# Run pytest when this file is executed directly as a script.
if __name__ == '__main__':
    pytest.main()